# Qwen3-32B Difficulty Evaluation

Evaluates the Qwen3-32B thinking model on training data to identify "tough" questions (ones the model gets wrong) and outputs a CSV with difficulty classifications.

**Output CSV columns:**
- `idx`: Row index from training data
- `prediction`: Model's predicted label, or `PARSE_ERROR` / `SKIPPED` / `ERROR` when no label could be obtained
- `golden_label`: Ground truth from training data
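- `difficulty`: "hard" (prediction differs from the golden label), "easy" (prediction matches it), or "unknown" (prediction or golden label is not one of the three valid labels)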
- `raw_output`: Full model response (including thinking)
- `question_snippet`: First 200 chars of the question

In [None]:
# Install dependencies if needed
%pip install transformers accelerate pandas tqdm scikit-learn -q

In [None]:
import re
import warnings

import pandas as pd
import torch
from tqdm.notebook import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

# Silence warnings whose message mentions attention_mask so they do not flood cell output.
warnings.filterwarnings("ignore", message=".*attention_mask.*")

# Report every CUDA device PyTorch can see, with its total memory in GiB.
gpu_count = torch.cuda.device_count()
print(f"Available GPUs: {gpu_count}")
for gpu_index in range(gpu_count):
    gpu = torch.cuda.get_device_properties(gpu_index)
    total_gib = gpu.total_memory / 1024**3
    print(f"  GPU {gpu_index}: {gpu.name} ({total_gib:.1f} GB)")

In [None]:
# ==================== CONFIGURATION ====================
# All tunables for the run live in this cell; edit them here.

# --- Model and file locations ---
MODEL_NAME = "Qwen/Qwen3-32B"            # Hugging Face model id (Qwen3-32B, thinking-capable)
DATA_PATH = "data/train/train.parquet"   # Parquet file with the training conversations
OUTPUT_PATH = "difficulty_analysis.csv"  # Where the per-example results CSV is written

# --- Run control ---
# Integer -> evaluate only the first N rows (quick test); None -> evaluate everything.
LIMIT = None
# Thinking-mode switch (set to False for standard generation).
ENABLE_THINKING = True
# Prompts with more tokens than this are recorded as SKIPPED instead of generated.
# Sized generously because the few-shot examples lengthen every prompt.
MAX_INPUT_LENGTH = 6144

# --- Sampling parameters (thinking mode) ---
MAX_NEW_TOKENS = 1024   # Generation budget; thinking output is long
TEMPERATURE = 0.6       # Qwen3 recommended for thinking
TOP_P = 0.95
TOP_K = 20

# --- Label space ---
# A prediction or golden label outside this set yields difficulty "unknown".
VALID_LABELS = {"Clear Reply", "Clear Non-Reply", "Ambivalent"}

# Few-shot examples for classification.
# Nine labelled question/answer pairs (1 Clear Reply, 5 Ambivalent, 3 Clear Non-Reply),
# each with a one-line explanation. The inference cell passes this text as `few_shot`
# to prepare_messages_for_inference, which appends it to the system message.
# The text is sent to the model verbatim, so edits here change the prompt.
FEW_SHOT_EXAMPLES = """
Here are examples for each classification category:

### Example 1: Clear Reply ###
Question: Do you have your own views about PR at Westminster don't you?
Answer: I do.
Label: Clear Reply
Explanation: The answer directly gives the info requested.

### Example 2: Ambivalent ###
Question: Are you going to watch television?
Answer: What else is there to do?
Label: Ambivalent
Explanation: They suggest planning to watch TV, despite not explicitly stating it.

### Example 3: Ambivalent ###
Question: Do you like my new dress?
Answer: We are late.
Label: Ambivalent
Explanation: Does not even acknowledge the question and goes straight to another topic.

### Example 4: Ambivalent ###
Question: Did you eat the last piece of pie?
Answer: I have to admit that this was a great recipe, I always like it when there are chocolate chips in the dough.
Label: Ambivalent
Explanation: Acknowledges the question but goes on a tangent about the chips, without answering.

### Example 5: Ambivalent ###
Question: Did you enjoy the film?
Answer: The directing was great.
Label: Ambivalent
Explanation: Directing is only part of what constitutes a film.

### Example 6: Ambivalent ###
Question: What's your favorite film?
Answer: Fight Club, Filth, and Hereditary.
Label: Ambivalent
Explanation: The reply gives three movies instead of one, which makes the desired information unclear.

### Example 7: Clear Non-Reply ###
Question: The hypothesis I was discussing, wouldn't you regard that as a defeat?
Answer: I am not going to prophesy what will happen.
Label: Clear Non-Reply
Explanation: Directly stating they won't answer.

### Example 8: Clear Non-Reply ###
Question: On what precise date did the government order the refit of the HMAS Kanimbla in preparation for its forward deployment to a possible war against Iraq?
Answer: I do not know that date. I will find out and let the House know.
Label: Clear Non-Reply
Explanation: Claims/admits they don't have the information.

### Example 9: Clear Non-Reply ###
Question: Was it your decision to release the fund?
Answer: You mean the public fund?
Label: Clear Non-Reply
Explanation: Gives no data, asks for clarification.
"""

In [None]:
# ==================== HELPER FUNCTIONS ====================

def extract_label(response: str) -> str:
    """
    Extract the classification label from a model response.

    Works for thinking-mode output (reasoning wrapped in <think>...</think>)
    and for plain output. Only text outside the reasoning is searched, in
    this order:

    1. An explicit ``LABEL: <label>`` marker (case-insensitive).
    2. A label name in the last 150 characters of the answer.
    3. A label name anywhere in the answer.

    Args:
        response: Decoded model output, possibly containing think tags.

    Returns:
        "Clear Reply", "Clear Non-Reply" or "Ambivalent", or "PARSE_ERROR"
        when no label is found. A response whose <think> block was never
        closed (generation stopped mid-reasoning) contains no final answer
        and therefore also yields "PARSE_ERROR".
    """
    # Strip reasoning. The `\Z` alternative also removes a <think> block that
    # was cut off before </think>; otherwise labels merely weighed in the
    # reasoning would be mistaken for the model's final answer.
    answer = re.sub(
        r'<think>.*?(?:</think>|\Z)', '', response, flags=re.DOTALL
    ).strip()

    # Look for explicit LABEL: pattern (common in COT responses)
    match = re.search(
        r'LABEL:\s*(Clear Reply|Clear Non-Reply|Ambivalent)',
        answer,
        re.IGNORECASE
    )
    if match:
        return normalize_label(match.group(1).strip())

    answer_lower = answer.lower()

    # Prefer the tail of the answer, then fall back to the whole answer.
    # "non-reply" is tested first because it is the most specific name
    # (and it also covers the full "clear non-reply" spelling).
    for haystack in (answer_lower[-150:], answer_lower):
        if "non-reply" in haystack:
            return "Clear Non-Reply"
        if "clear reply" in haystack:
            return "Clear Reply"
        if "ambivalent" in haystack:
            return "Ambivalent"

    return "PARSE_ERROR"


def normalize_label(label: str) -> str:
    """Map a free-form label string onto one of the three canonical labels.

    Matching is a case-insensitive substring test. Text that matches none of
    the known labels is returned exactly as it was passed in.
    """
    folded = label.lower().strip()

    # Order matters: the non-reply spellings are checked before "clear reply".
    rules = (
        (("non-reply", "non reply"), "Clear Non-Reply"),
        (("clear reply",), "Clear Reply"),
        (("ambivalent",), "Ambivalent"),
    )
    for needles, canonical in rules:
        if any(needle in folded for needle in needles):
            return canonical
    return label


def extract_ground_truth(conversations: list) -> str:
    """Return the normalized label held in the first assistant message.

    Yields "UNKNOWN" when the conversation has no assistant message.
    """
    assistant_turn = next(
        (turn for turn in conversations if turn.get("role") == "assistant"),
        None,
    )
    if assistant_turn is None:
        return "UNKNOWN"
    return normalize_label(assistant_turn.get("content", "").strip())


def extract_question_snippet(conversations: list, max_len: int = 200) -> str:
    """Return a short preview of the first user message.

    When the message has a "### Specific Question ... ###" heading, the
    preview is the first line after that heading; otherwise it is the whole
    message. Previews longer than ``max_len`` characters are cut to
    ``max_len`` and suffixed with "...". Returns "" when there is no user
    message.
    """
    user_turn = next(
        (turn for turn in conversations if turn.get("role") == "user"),
        None,
    )
    if user_turn is None:
        return ""

    text = user_turn.get("content", "")
    heading = re.search(
        r'### Specific Question[^#]*###\s*(.+?)(?:\n|$)', text, re.DOTALL
    )
    preview = heading.group(1).strip() if heading else text
    return preview[:max_len] + "..." if len(preview) > max_len else preview


def prepare_messages_for_inference(conversations: list, few_shot: str = None) -> list:
    """
    Build the chat messages sent to the model for one example.

    Assistant messages (the ground-truth label) are dropped. When
    ``few_shot`` is given, it is appended to the first system message,
    separated by a blank line. If the conversation has no system message,
    a system message holding ``few_shot`` is inserted at the front so the
    examples are never silently lost.

    Args:
        conversations: Message dicts with "role" and "content" keys.
        few_shot: Few-shot example text; None or "" adds nothing.

    Returns:
        A new list of {"role", "content"} dicts; the input is not modified.
    """
    messages = []
    few_shot_pending = bool(few_shot)

    for msg in conversations:
        role = msg.get("role")
        if role == "assistant":
            continue  # Skip ground truth

        content = msg["content"]
        if role == "system" and few_shot_pending:
            # Attach the examples once; repeating them in every system
            # message would only lengthen the prompt.
            content = content + "\n\n" + few_shot
            few_shot_pending = False

        messages.append({"role": msg["role"], "content": content})

    if few_shot_pending:
        # No system message to extend: carry the examples in one of their own.
        messages.insert(0, {"role": "system", "content": few_shot})

    return messages


print("Helper functions defined")

In [None]:
# ==================== LOAD MODEL (multi-GPU) ====================
# Loads the tokenizer and the model, then reports where the modules were placed.
print("=" * 60)
print(f"Loading model: {MODEL_NAME}")
print("Distributing across available GPUs with FP16...")
print("=" * 60)

# Load tokenizer
print("\nLoading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    # Fall back to EOS as the padding token when the tokenizer defines none.
    tokenizer.pad_token = tokenizer.eos_token

# Load model distributed across GPUs
print("Loading model with device_map='auto' (distributes across all GPUs)...")
# NOTE(review): trust_remote_code=True allows Python code shipped in the model
# repository to run locally; keep it only for model sources you trust.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

print("\nModel loaded successfully!")
if hasattr(model, 'hf_device_map'):
    devices = set(model.hf_device_map.values())
    print(f"Model distributed across devices: {devices}")

    # Count how many modules landed on each device.
    device_counts = {}
    for device in model.hf_device_map.values():
        device_counts[device] = device_counts.get(device, 0) + 1

    # Device ids can be ints (GPU index) mixed with strings such as "cpu" or
    # "disk" when modules are offloaded; a plain sorted() would raise
    # TypeError on that mix. Sort GPU indices numerically first, then the rest
    # by name.
    def _device_sort_key(item):
        device = item[0]
        if isinstance(device, int):
            return (0, device, "")
        return (1, 0, str(device))

    for device, count in sorted(device_counts.items(), key=_device_sort_key):
        print(f"  Device {device}: {count} modules")

In [None]:
# ==================== LOAD TRAINING DATA ====================
# Read the parquet file, optionally truncate it for a quick test run, and show
# the first few ground-truth labels as a sanity check.
print(f"Loading training data from: {DATA_PATH}")
df = pd.read_parquet(DATA_PATH)
print(f"Loaded {len(df)} training examples")
print(f"Columns: {df.columns.tolist()}")

if LIMIT:
    print(f"\nLimiting to first {LIMIT} examples (for testing)")
    df = df.head(LIMIT)

# Preview: golden labels of the first five rows, numbered from 1
print("\nSample ground truth labels:")
preview_rows = df.head(5)
sample_labels = [
    extract_ground_truth(record["conversations"])
    for _, record in preview_rows.iterrows()
]
for position, label in enumerate(sample_labels, start=1):
    print(f"  {position}. {label}")

In [None]:
# ==================== RUN INFERENCE ====================
# For every row: build the prompt, generate one sampled response, extract the
# predicted label and compare it with the golden label. Every row produces
# exactly one entry in `results`, including rows that fail or are skipped.


def _result_row(idx, prediction, golden_label, difficulty, raw_output, question_snippet):
    """Build one result record; the keys are the columns of the output CSV."""
    return {
        "idx": idx,
        "prediction": prediction,
        "golden_label": golden_label,
        "difficulty": difficulty,
        "raw_output": raw_output,
        "question_snippet": question_snippet,
    }


results = []
total = len(df)

print("\n" + "=" * 60)
print(f"Starting inference on {total} examples")
print(f"Thinking mode: {'enabled' if ENABLE_THINKING else 'disabled'}")
print("=" * 60 + "\n")

for idx, row in tqdm(df.iterrows(), total=total, desc="Evaluating"):
    conversations = row["conversations"]

    # Convert numpy array to list if needed
    if hasattr(conversations, 'tolist'):
        conversations = conversations.tolist()

    # Extract ground truth before removing assistant message
    golden_label = extract_ground_truth(conversations)
    question_snippet = extract_question_snippet(conversations)

    # Prepare messages for inference (remove assistant message, add few-shot examples)
    messages = prepare_messages_for_inference(conversations, few_shot=FEW_SHOT_EXAMPLES)

    # Apply chat template
    try:
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            # Forward the configured switch to the chat template; without it
            # ENABLE_THINKING had no effect on the prompt that is generated.
            enable_thinking=ENABLE_THINKING,
        )
    except Exception as e:
        tqdm.write(f"[{idx + 1}/{total}] Template error: {e}")
        results.append(_result_row(
            idx, "ERROR", golden_label, "unknown",
            f"Template error: {e}", question_snippet,
        ))
        continue

    # Tokenize
    encoded = tokenizer(prompt, return_tensors="pt")
    inputs = encoded["input_ids"]
    input_len = inputs.shape[-1]

    # Skip if too long
    if input_len > MAX_INPUT_LENGTH:
        tqdm.write(f"[{idx + 1}/{total}] SKIPPED (input too long: {input_len} tokens)")
        results.append(_result_row(
            idx, "SKIPPED", golden_label, "unknown",
            f"Input too long: {input_len} tokens", question_snippet,
        ))
        continue

    try:
        # Move to model's device
        inputs = inputs.to(model.device)
        # Single unpadded sequence, so every position is attended to.
        attention_mask = torch.ones_like(inputs)

        # Generate with thinking-optimized parameters
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs,
                attention_mask=attention_mask,
                max_new_tokens=MAX_NEW_TOKENS,
                temperature=TEMPERATURE,
                top_p=TOP_P,
                top_k=TOP_K,
                do_sample=True,
                use_cache=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Decode only the new tokens (preserve <think> tags)
        full_response = tokenizer.decode(
            outputs[0, input_len:],
            skip_special_tokens=False
        ).strip()

        # Extract prediction label
        prediction = extract_label(full_response)

        # Determine difficulty: only comparable when both sides are valid labels
        if prediction in VALID_LABELS and golden_label in VALID_LABELS:
            difficulty = "easy" if prediction == golden_label else "hard"
        else:
            difficulty = "unknown"

        results.append(_result_row(
            idx, prediction, golden_label, difficulty,
            full_response, question_snippet,
        ))

    except Exception as e:
        # Best effort: record the failure and keep going with the next row.
        tqdm.write(f"[{idx + 1}/{total}] ERROR: {e}")
        results.append(_result_row(
            idx, "ERROR", golden_label, "unknown", str(e), question_snippet,
        ))

    # Clear CUDA cache periodically
    if (idx + 1) % 25 == 0:
        torch.cuda.empty_cache()

print("\nInference complete!")

In [None]:
# ==================== CREATE RESULTS DATAFRAME ====================
# One row per entry appended to `results` by the inference cell.
results_df = pd.DataFrame(results)

# Save to CSV (index=False: the DataFrame's own index is not written as a column)
results_df.to_csv(OUTPUT_PATH, index=False)
print(f"Saved results to: {OUTPUT_PATH}")
print(f"DataFrame shape: {results_df.shape}")

# Preview results (last expression of the cell, so the notebook renders it)
results_df.head(10)

In [None]:
# ==================== DIFFICULTY ANALYSIS ====================
# Summarises how many examples were easy / hard / unknown, the accuracy over
# the comparable rows, and the share of hard examples per golden label.
print("=" * 60)
print("DIFFICULTY ANALYSIS SUMMARY")
print("=" * 60)

total = len(results_df)

# Count by difficulty
difficulty_counts = results_df["difficulty"].value_counts()

print(f"\nTotal examples: {total}")
print("\nDifficulty Distribution:")
for diff in ["easy", "hard", "unknown"]:
    count = difficulty_counts.get(diff, 0)
    pct = count / total * 100 if total > 0 else 0
    print(f"  {diff:10}: {count:5} ({pct:5.1f}%)")

# Filter to valid predictions only ("unknown" rows cannot be scored)
valid_mask = results_df["difficulty"].isin(["easy", "hard"])
valid_df = results_df[valid_mask]

if len(valid_df) > 0:
    # Accuracy: an "easy" row is one where the prediction matched the label
    accuracy = (valid_df["difficulty"] == "easy").sum() / len(valid_df)
    print(f"\nModel Accuracy: {accuracy:.4f}")

    # Breakdown by ground truth label.
    # VALID_LABELS is a set, whose iteration order can differ between runs;
    # sorting keeps the printed table in the same order every time.
    print("\nDifficulty by Ground Truth Label:")
    for label in sorted(VALID_LABELS):
        label_df = valid_df[valid_df["golden_label"] == label]
        if len(label_df) > 0:
            hard_count = (label_df["difficulty"] == "hard").sum()
            hard_pct = hard_count / len(label_df) * 100
            print(f"  {label:20}: {hard_count:4}/{len(label_df):4} hard ({hard_pct:5.1f}%)")

In [None]:
# ==================== CLASSIFICATION METRICS ====================
# Standard classification metrics over the rows where both the golden label
# and the prediction are valid labels.
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    f1_score,
    classification_report,
    confusion_matrix
)

# Filter to valid predictions only
y_true = results_df["golden_label"]
y_pred = results_df["prediction"]

mask = y_true.isin(VALID_LABELS) & y_pred.isin(VALID_LABELS)
y_true_filtered = y_true[mask]
y_pred_filtered = y_pred[mask]

print(f"Valid predictions: {mask.sum()} / {len(mask)}")
print(f"Filtered out: {(~mask).sum()} (PARSE_ERROR, SKIPPED, ERROR)")

if len(y_true_filtered) > 0:
    # One fixed label order shared by the report and the confusion matrix.
    # Deriving it from the VALID_LABELS set gave an order that could change
    # between runs and disagree with the matrix below. The order must match
    # the "Pred CR / Pred CNR / Pred Amb" column header.
    labels = ["Clear Reply", "Clear Non-Reply", "Ambivalent"]

    # Accuracy
    accuracy = accuracy_score(y_true_filtered, y_pred_filtered)

    # Precision, Recall, F1 (Macro)
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        y_true_filtered,
        y_pred_filtered,
        average="macro",
        zero_division=0
    )

    # Weighted F1
    f1_weighted = f1_score(
        y_true_filtered,
        y_pred_filtered,
        average="weighted",
        zero_division=0
    )

    print("\n" + "=" * 60)
    print("CLASSIFICATION METRICS")
    print("=" * 60)
    print(f"\nAccuracy        : {accuracy:.4f}")
    print(f"Precision (Mac) : {precision_macro:.4f}")
    print(f"Recall (Mac)    : {recall_macro:.4f}")
    print(f"F1 (Macro)      : {f1_macro:.4f}")
    print(f"F1 (Weighted)   : {f1_weighted:.4f}")

    # Detailed classification report
    print("\n" + "=" * 60)
    print("CLASSIFICATION REPORT")
    print("=" * 60)
    print(classification_report(
        y_true_filtered,
        y_pred_filtered,
        labels=labels,
        zero_division=0
    ))

    # Confusion Matrix (rows: golden label, columns: prediction)
    cm = confusion_matrix(y_true_filtered, y_pred_filtered, labels=labels)

    print("\nConfusion Matrix:")
    print("-" * 60)
    print(f"{'':20} {'Pred CR':>12} {'Pred CNR':>12} {'Pred Amb':>12}")
    print("-" * 60)
    for true_label, counts in zip(labels, cm):
        row = f"{true_label:20}"
        for value in counts:
            row += f" {value:>12}"
        print(row)
    print("-" * 60)
else:
    print("No valid predictions to compute metrics!")

In [None]:
# ==================== PROCESSING ISSUES ====================
# Tally the sentinel predictions that mark rows without a usable label.
prediction_counts = results_df["prediction"].value_counts()
parse_errors = prediction_counts.get("PARSE_ERROR", 0)
skipped = prediction_counts.get("SKIPPED", 0)
errors = prediction_counts.get("ERROR", 0)

print("Processing Issues:")
print(f"  Parse errors: {parse_errors}")
print(f"  Skipped:      {skipped}")
print(f"  Errors:       {errors}")

# Full frequency table of predictions, sentinels included
print("\nPrediction Distribution:")
print(prediction_counts)

In [None]:
# ==================== HARD QUESTIONS ANALYSIS ====================
# "hard" rows are those where a valid prediction disagreed with the golden label.
is_hard = results_df["difficulty"] == "hard"
hard_df = results_df[is_hard]

print(f"Total HARD questions: {len(hard_df)}")
print("\nSample hard questions (first 10):")
print("-" * 80)

for _, hard_row in hard_df.head(10).iterrows():
    # Slicing already caps the preview at 150 characters and leaves shorter text intact.
    raw_preview = hard_row['raw_output'][:150]
    print(f"\nIdx: {hard_row['idx']}")
    print(f"Golden Label: {hard_row['golden_label']}")
    print(f"Prediction:   {hard_row['prediction']}")
    print(f"Question:     {hard_row['question_snippet'][:100]}...")
    print(f"Raw Output:   {raw_preview}...")
    print("-" * 80)

In [None]:
# ==================== EXPORT HARD QUESTIONS ====================
# Write the hard rows next to the main results file as "<name>_hard_only<ext>".
import os

# Split on the real extension instead of replacing the ".csv" substring:
# str.replace returned OUTPUT_PATH unchanged when it did not contain ".csv"
# (e.g. "results.CSV" or "results"), so the export overwrote the full results.
output_root, output_ext = os.path.splitext(OUTPUT_PATH)
hard_output_path = f"{output_root}_hard_only{output_ext or '.csv'}"
hard_df.to_csv(hard_output_path, index=False)
print(f"Saved {len(hard_df)} hard questions to: {hard_output_path}")

In [None]:
# ==================== CLEANUP ====================
# Free GPU memory
import gc

del model
# Collect first: empty_cache() can only return memory whose tensors have
# already been freed, and objects caught in reference cycles are not freed
# until the garbage collector runs.
gc.collect()
torch.cuda.empty_cache()
print("GPU memory cleared")