# 🎯 Objective Clustering Evaluation

**Problem Identified in Notebook 06:**
- K=2-3 gives highest silhouette (0.595-0.604)
- But these clusters are **dominated by dataset sources** (99% in one cluster)
- Silhouette score is **misleading** - measures geometric separation, not routing utility

## 🔬 Goal: Find Optimal K Using Objective Metrics

We'll use **quantifiable metrics** instead of heuristic labels:

### 1. Dataset Mixing Entropy 🥇 (Most Important)
- **What**: Measures whether clusters are merely separating datasets
- **Target**: > 1.0 nats (higher = better mixing; with 4 datasets the maximum is ln 4 ≈ 1.39)
- **Why**: Clusters should group by task properties, not dataset source

### 2. Cluster Balance (Gini Coefficient) 🥉
- **What**: Measures inequality in cluster sizes
- **Target**: < 0.3 (lower = more balanced)
- **Why**: Need enough samples per cluster for error rate estimation

### 3. Prompt Length Variance 📏
- **What**: Variance of prompt lengths within clusters
- **Target**: Lower is better (clusters group similar complexity)
- **Why**: Prompt length correlates with task complexity

### 4. Combined Objective Score 🎯
- Weighted combination of all metrics
- Finds K that maximizes routing utility, not just geometric separation

---
**⚡ GPU recommended for embeddings**

## 0. 🔧 Setup

In [None]:
# Install packages with version compatibility for Colab T4 GPU
# Colab now uses NumPy 2.x as default for many packages
#
# This cell is notebook-only: every `!pip ...` line is an IPython shell escape,
# so the cell cannot be executed as a plain Python script.
!pip install -q --upgrade pip

# Use NumPy 2.x to avoid conflicts with Colab pre-installed packages
# NOTE(review): if NumPy was already imported by this kernel, the upgraded
# version presumably only takes effect after a runtime restart — confirm.
!pip install -q 'numpy>=2.0.0'

# Core ML packages - latest versions compatible with NumPy 2.x
!pip install -q 'transformers>=4.40.0'
!pip install -q 'datasets>=2.18.0'
!pip install -q 'scikit-learn>=1.4.0'
!pip install -q 'scipy>=1.12.0'

# Visualization packages
!pip install -q 'matplotlib>=3.8.0'
!pip install -q 'seaborn>=0.13.0'
!pip install -q 'pandas>=2.2.0'

# Verify installations
# torch is imported but not installed above; this cell relies on the runtime
# already providing it.
import torch
import numpy as np
import transformers
import sklearn

# Report the versions that were actually imported, plus CUDA visibility, so the
# environment is recorded alongside the results.
print(f'‚úÖ Packages installed!')
print(f'NumPy version: {np.__version__}')
print(f'PyTorch version: {torch.__version__}')
print(f'Transformers version: {transformers.__version__}')
print(f'scikit-learn version: {sklearn.__version__}')
print(f'CUDA available: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'CUDA version: {torch.version.cuda}')

In [None]:
import torch

# Pick the compute device once; later cells read the module-level `device`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Device: {device}')

if device != 'cuda':
    # No GPU visible: tell the user how to enable one in Colab.
    print('üíª CPU mode')
    print('   üí° Enable GPU: Runtime ‚Üí Change runtime type ‚Üí T4 GPU')
else:
    # Describe GPU 0: name, total memory and compute capability.
    gpu_name = torch.cuda.get_device_name(0)
    print(f'GPU: {gpu_name}')
    props = torch.cuda.get_device_properties(0)
    print(f'Memory: {props.total_memory / 1e9:.1f} GB')
    print(f'Compute Capability: {props.major}.{props.minor}')

    # T4-specific hint for batch sizing.
    if 'T4' in gpu_name:
        print('‚úÖ T4 GPU detected - optimized for mixed precision training')
        print('   Recommendation: Use batch_size=32-64 for best performance')

    # Drop cached allocations so the free-memory figure reflects a clean start.
    torch.cuda.empty_cache()
    print(f'Available memory: {torch.cuda.mem_get_info()[0] / 1e9:.1f} GB')

## 1. 📦 Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
import json
import time
warnings.filterwarnings('ignore')

# Clustering
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.stats import entropy

# Metrics
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import normalize

# Embeddings
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset
import torch

# Viz
# Global plotting defaults for every figure drawn later in the notebook.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# IPython magics (notebook-only): render figures inline, at retina resolution.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Seed NumPy's global RNG.
# NOTE(review): nothing visible in this notebook draws from it — confirm it is
# still needed.
np.random.seed(42)

print('‚úÖ Imports complete!')

## 2. 📥 Load Coding Datasets

In [None]:
def _swe_bench_record(item):
    """Turn one SWE-bench row into a question record, or None if it has no problem text."""
    problem = item.get("problem_statement", "")
    repo = item.get("repo", "")
    if not problem:
        return None
    return {
        "question": problem,
        "source": "swe_bench",
        "repo": repo,
        "prompt_length": len(problem),
        "word_count": len(problem.split()),
    }


def _ds1000_record(item):
    """Turn one DS-1000 row into a question record, or None if it has no prompt."""
    prompt = item.get("prompt", "")
    metadata = item.get("metadata", {})
    # metadata is only trusted when it is a dict; anything else maps to "unknown".
    library = metadata.get("library", "unknown") if isinstance(metadata, dict) else "unknown"
    if not prompt:
        return None
    return {
        "question": prompt,
        "source": "ds1000",
        "library": library,
        "prompt_length": len(prompt),
        "word_count": len(prompt.split()),
    }


def _bigcodebench_record(item):
    """Turn one BigCodeBench row into a question record, or None if both prompts are empty."""
    complete_prompt = item.get("complete_prompt", "")
    instruct_prompt = item.get("instruct_prompt", "")
    # Prefer the instruction-style prompt; fall back to the completion-style one.
    prompt = instruct_prompt or complete_prompt
    if not prompt:
        return None
    return {
        "question": prompt,
        "source": "bigcodebench",
        "prompt_length": len(prompt),
        "word_count": len(prompt.split()),
    }


def _debugbench_record(item):
    """Turn one DebugBench row into a question record, or None if it has no buggy code."""
    buggy_code = item.get("buggy_code", "")
    language = item.get("language", "python").lower()
    if not buggy_code:
        return None
    prompt = f"Debug this code:\n{buggy_code}"
    return {
        "question": prompt,
        "source": "debugbench",
        "language": language,
        "prompt_length": len(prompt),
        "word_count": len(prompt.split()),
    }


def _append_records(questions, dataset, build_record, limit):
    """Append up to `limit` usable records from `dataset` to `questions`; return how many were added.

    Records are appended as they are built, so rows collected before an
    exception are kept. `limit=None` means no cap.
    """
    added = 0
    for item in dataset:
        if limit is not None and added >= limit:
            break
        record = build_record(item)
        if record is not None:
            questions.append(record)
            added += 1
    return added


# (title, hub path, split, per-source cap or None, record builder, noun for the success line)
_CODING_SOURCES = (
    ("SWE-bench (GitHub issues)", "princeton-nlp/SWE-bench_Lite", "test",
     2000, _swe_bench_record, "GitHub issues"),
    ("DS-1000 (Data science tasks)", "xlangai/DS-1000", "test",
     None, _ds1000_record, "data science tasks"),
    ("BigCodeBench (API tasks)", "bigcode/bigcodebench", "v0.1.2",
     500, _bigcodebench_record, "API tasks"),
    ("DebugBench (Debugging tasks)", "Rtian/DebugBench", "test",
     500, _debugbench_record, "debugging tasks"),
)


def load_coding_datasets(max_total=4000):
    """
    Load diverse coding datasets with source tracking.

    Sources are loaded in a fixed order (SWE-bench, DS-1000, BigCodeBench,
    DebugBench). Each source has its own cap (2000 / none / 500 / 500) and the
    combined result never exceeds `max_total`; the budget is consumed in
    source order, so a small `max_total` truncates the later sources first.

    Args:
        max_total: Upper bound on the number of returned records. `None`
            disables the global bound; values <= 0 yield an empty list.

    Returns:
        A list of dicts. Every dict has "question", "source", "prompt_length"
        (characters) and "word_count" (whitespace-separated tokens); some
        sources add "repo", "library" or "language".

    A source that fails to load is reported and skipped (best effort), so the
    result may contain fewer sources than requested.
    """
    questions = []

    print("="*70)
    print("LOADING CODING DATASETS")
    print("="*70)

    for number, (title, path, split, cap, build_record, noun) in enumerate(_CODING_SOURCES, start=1):
        print(f"\n{number}. Loading {title}...")

        # Combine the per-source cap with whatever is left of the global budget.
        if max_total is None:
            limit = cap
        else:
            budget = max(max_total - len(questions), 0)
            limit = budget if cap is None else min(cap, budget)

        if limit == 0:
            # Budget exhausted: skip the download entirely.
            print(f"   - Skipped: max_total={max_total} already reached")
            continue

        try:
            dataset = load_dataset(path, split=split)
            count = _append_records(questions, dataset, build_record, limit)
            print(f"   ‚úì Loaded {count} {noun}")
        except Exception as e:
            # Deliberate best effort: one unavailable dataset must not abort the rest.
            print(f"   ‚úó Error: {e}")

    print(f"\n{'='*70}")
    print(f"‚úÖ Total: {len(questions)} coding tasks")
    print(f"\nDataset Distribution:")
    source_counts = Counter(q['source'] for q in questions)
    for source, count in source_counts.most_common():
        pct = count / len(questions) * 100
        print(f"  {source}: {count} ({pct:.1f}%)")
    print(f"{'='*70}")

    return questions

# Fetch every source once; later cells reuse these module-level names.
questions = load_coding_datasets(max_total=4000)

# Tabular view of the records for the analysis cells.
df = pd.DataFrame(questions)

# Raw prompt strings, in the same row order as `df`, for the encoder.
texts = [record['question'] for record in questions]

## 3. 🧠 Extract CodeBERT Embeddings

In [None]:
def mean_pooling(token_embeddings, attention_mask):
    """Average token embeddings along dim 1, counting only positions where the mask is set.

    The mask gets a trailing axis and is expanded to the embeddings' size, so
    masked positions contribute zero to the sum. The divisor is clamped to at
    least 1e-9, so a row whose mask is entirely zero returns zeros rather
    than NaN.
    """
    weights = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = torch.sum(token_embeddings * weights, 1)
    token_counts = torch.clamp(weights.sum(1), min=1e-9)
    return masked_sum / token_counts

def encode_with_codebert(model, tokenizer, texts, device, batch_size=32):
    """Embed `texts` in batches and return the stacked mean-pooled vectors as a NumPy array.

    The model is switched to eval mode and run without gradients. Each batch is
    tokenized with padding and truncation (max_length=512), moved to `device`,
    pooled with `mean_pooling` over `last_hidden_state`, and copied to the
    CPU. Progress is printed on every tenth batch.
    """
    model.eval()
    total = len(texts)
    pooled_batches = []

    print(f'üöÄ Encoding {len(texts)} texts with CodeBERT...')

    with torch.no_grad():
        for batch_number, start in enumerate(range(0, total, batch_size)):
            stop = start + batch_size

            inputs = tokenizer(
                texts[start:stop],
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors='pt'
            ).to(device)

            hidden_states = model(**inputs).last_hidden_state
            pooled = mean_pooling(hidden_states, inputs['attention_mask'])
            pooled_batches.append(pooled.cpu().numpy())

            # Carriage return keeps the progress counter on a single line.
            if batch_number % 10 == 0:
                print(f'  Processed {min(stop, total)}/{total}', end='\r')

    print(f'  Processed {total}/{total} ‚úì')
    return np.vstack(pooled_batches)

for banner_line in ("="*70, "LOADING CODEBERT", "="*70):
    print(banner_line)

# Tokenizer and encoder come from the same checkpoint.
checkpoint = 'microsoft/codebert-base'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint).to(device)

# Encode every prompt, then L2-normalise each row of the embedding matrix.
codebert_embeddings = encode_with_codebert(model, tokenizer, texts, device, batch_size=32)
codebert_norm = normalize(codebert_embeddings, norm='l2')

print(f"\n‚úÖ CodeBERT embeddings: {codebert_norm.shape}")
print(f"   Mean: {codebert_norm.mean():.4f}, Std: {codebert_norm.std():.4f}")

# The encoder is not needed after this point; release it and any cached GPU memory.
del model, tokenizer
if device == 'cuda':
    torch.cuda.empty_cache()

print("="*70)

## 4. 📊 Define Objective Evaluation Metrics

In [None]:
def dataset_mixing_entropy(cluster_labels, dataset_sources):
    """
    Measures how well datasets are mixed within clusters.

    Args:
        cluster_labels: Cluster id per sample (array, Series or plain list).
        dataset_sources: Source dataset name per sample, aligned with
            `cluster_labels` (array, Series or plain list).

    Returns:
        dict with
        - 'avg_entropy': unweighted mean over clusters of the Shannon entropy
          (natural log) of each cluster's source distribution (higher = better mixing)
        - 'avg_max_dataset_pct': unweighted mean over clusters of the largest
          single-source share (lower = better)
        - 'per_cluster_stats': DataFrame with one row per cluster (cluster,
          size, entropy, max_dataset_pct, dominant_dataset, num_datasets)
    """
    # Coerce up front: boolean-mask indexing below needs arrays, and plain
    # lists would otherwise fail.
    cluster_labels = np.asarray(cluster_labels)
    dataset_sources = np.asarray(dataset_sources)

    cluster_stats = []

    for cluster_id in np.unique(cluster_labels):
        mask = cluster_labels == cluster_id
        cluster_sources = dataset_sources[mask]

        # Count datasets in this cluster
        source_counts = Counter(cluster_sources)
        total = sum(source_counts.values())

        # Calculate entropy (higher = more diverse)
        probs = np.array([count / total for count in source_counts.values()])
        cluster_entropy = entropy(probs)

        # Share and name of the most common source (lower share = more balanced)
        dominant_dataset, dominant_count = source_counts.most_common(1)[0]
        max_pct = dominant_count / total

        cluster_stats.append({
            'cluster': cluster_id,
            'size': total,
            'entropy': cluster_entropy,
            'max_dataset_pct': max_pct,
            'dominant_dataset': dominant_dataset,
            'num_datasets': len(source_counts)
        })

    stats_df = pd.DataFrame(cluster_stats)

    return {
        'avg_entropy': stats_df['entropy'].mean(),
        'avg_max_dataset_pct': stats_df['max_dataset_pct'].mean(),
        'per_cluster_stats': stats_df
    }

def gini_coefficient(cluster_sizes):
    """
    Gini coefficient of the cluster-size distribution.

    Returns:
    - 0.0 when every cluster has the same size
    - larger values for more unequal sizes; with n clusters the value
      approaches (n - 1) / n as one cluster absorbs nearly all samples
    """
    ordered = np.array(sorted(cluster_sizes))
    n = len(ordered)
    ranks = np.arange(1, n + 1)
    # Rank-weighted sum over ascending sizes (standard closed form for Gini).
    rank_weighted = np.sum(ranks * ordered)
    total = np.sum(ordered)
    return (2 * rank_weighted) / (n * total) - (n + 1) / n

def prompt_length_variance(cluster_labels, prompt_lengths):
    """
    Size-weighted mean of the within-cluster variance of prompt lengths.
    Lower = clusters group similar-length prompts (likely similar complexity)

    Args:
        cluster_labels: Cluster id per sample (array, Series or plain list).
        prompt_lengths: Prompt length per sample, aligned with `cluster_labels`.

    Returns:
        sum over clusters of (population variance * cluster size), divided by
        the total number of samples.
    """
    # Coerce so boolean-mask indexing also works when callers pass plain lists.
    cluster_labels = np.asarray(cluster_labels)
    prompt_lengths = np.asarray(prompt_lengths)

    total_variance = 0

    for cluster_id in np.unique(cluster_labels):
        cluster_lengths = prompt_lengths[cluster_labels == cluster_id]

        # Weight each cluster's variance by its size
        total_variance += np.var(cluster_lengths) * len(cluster_lengths)

    return total_variance / len(cluster_labels)

def word_count_variance(cluster_labels, word_counts):
    """
    Size-weighted mean of the within-cluster variance of word counts.
    Lower = clusters group similar-complexity prompts

    Args:
        cluster_labels: Cluster id per sample (array, Series or plain list).
        word_counts: Word count per sample, aligned with `cluster_labels`.

    Returns:
        sum over clusters of (population variance * cluster size), divided by
        the total number of samples.
    """
    # Coerce so boolean-mask indexing also works when callers pass plain lists.
    cluster_labels = np.asarray(cluster_labels)
    word_counts = np.asarray(word_counts)

    total_variance = 0

    for cluster_id in np.unique(cluster_labels):
        cluster_words = word_counts[cluster_labels == cluster_id]

        # Weight each cluster's variance by its size
        total_variance += np.var(cluster_words) * len(cluster_words)

    return total_variance / len(cluster_labels)

def compute_objective_score(cluster_labels, dataset_sources, prompt_lengths, word_counts):
    """
    Compute combined objective score for clustering quality.

    Higher score = better for routing!

    Components (weights in parentheses):
    - entropy_score (0.40): average per-cluster source entropy divided by its
      theoretical maximum ln(number of distinct sources), so it lies in [0, 1]
    - balance_score (0.30): 1 - Gini coefficient of the cluster sizes
    - length_score (0.15): 1 - within-cluster / overall prompt-length variance
    - word_score (0.15): 1 - within-cluster / overall word-count variance

    Returns:
        dict with 'objective_score', the four component scores, and the raw
        'avg_entropy', 'gini' and 'avg_max_dataset_pct'.
    """
    # 1. Dataset mixing entropy (weight: 40%)
    mixing = dataset_mixing_entropy(cluster_labels, dataset_sources)
    # Entropy is in nats and bounded by ln(#sources). Normalise by the real bound
    # rather than a fixed constant, so the score stays in [0, 1] even when a
    # source failed to load. With a single source, mixing is impossible: score 0.
    n_sources = len(set(dataset_sources))
    max_entropy = np.log(n_sources) if n_sources > 1 else 0.0
    entropy_score = mixing['avg_entropy'] / max_entropy if max_entropy > 0 else 0.0

    # 2. Cluster balance (weight: 30%)
    cluster_sizes = list(Counter(cluster_labels).values())
    gini = gini_coefficient(cluster_sizes)
    balance_score = 1 - gini  # Invert so higher is better

    # 3. Prompt length homogeneity (weight: 15%)
    length_var = prompt_length_variance(cluster_labels, prompt_lengths)
    # Normalize by overall variance. If the overall variance is zero there is
    # nothing for the clustering to explain, so score 0 instead of dividing by 0.
    overall_length_var = np.var(prompt_lengths)
    length_score = 1 - (length_var / overall_length_var) if overall_length_var > 0 else 0.0

    # 4. Word count homogeneity (weight: 15%)
    word_var = word_count_variance(cluster_labels, word_counts)
    overall_word_var = np.var(word_counts)
    word_score = 1 - (word_var / overall_word_var) if overall_word_var > 0 else 0.0

    # Combined score (weighted average)
    objective_score = (
        0.40 * entropy_score +
        0.30 * balance_score +
        0.15 * length_score +
        0.15 * word_score
    )

    return {
        'objective_score': objective_score,
        'entropy_score': entropy_score,
        'balance_score': balance_score,
        'length_score': length_score,
        'word_score': word_score,
        'avg_entropy': mixing['avg_entropy'],
        'gini': gini,
        'avg_max_dataset_pct': mixing['avg_max_dataset_pct']
    }

print('‚úÖ Objective metrics defined!')

## 5. 🔬 Comprehensive Clustering Evaluation

In [None]:
# Search grid. The banner below is rendered from these lists so the printed
# description cannot drift from what is actually evaluated.
linkages = ['ward', 'complete', 'average']
k_values = [2, 3, 4, 5, 6, 8, 10, 12, 15, 20, 25, 30]

print("="*70)
print("OBJECTIVE CLUSTERING EVALUATION")
print("="*70)
print(f"Testing: {', '.join(linkages)} linkages")
print(f"K values: {', '.join(str(k) for k in k_values)}\n")

# Prepare data
dataset_sources = df['source'].values
prompt_lengths = df['prompt_length'].values
word_counts = df['word_count'].values

# One dict per (linkage, K) configuration that completed successfully.
results = []

for linkage_method in linkages:
    print(f"\n{'='*70}")
    print(f"Testing linkage: {linkage_method.upper()}")
    print(f"{'='*70}")

    for k in k_values:
        try:
            start_time = time.time()

            # Fit clustering (ward is paired with euclidean here; other linkages use cosine)
            agg = AgglomerativeClustering(
                n_clusters=k,
                linkage=linkage_method,
                metric='euclidean' if linkage_method == 'ward' else 'cosine'
            )

            labels = agg.fit_predict(codebert_norm)
            # Only the clustering fit is timed, not the metric computation.
            elapsed = time.time() - start_time

            # Compute all metrics
            obj_scores = compute_objective_score(
                labels, dataset_sources, prompt_lengths, word_counts
            )

            # Also compute silhouette for comparison
            sil = silhouette_score(codebert_norm, labels, metric='cosine')

            result = {
                'linkage': linkage_method,
                'k': k,
                'objective_score': obj_scores['objective_score'],
                'entropy_score': obj_scores['entropy_score'],
                'balance_score': obj_scores['balance_score'],
                'length_score': obj_scores['length_score'],
                'word_score': obj_scores['word_score'],
                'avg_entropy': obj_scores['avg_entropy'],
                'gini': obj_scores['gini'],
                'avg_max_dataset_pct': obj_scores['avg_max_dataset_pct'],
                'silhouette': sil,
                'time_sec': elapsed
            }

            results.append(result)

            # Print results
            print(f"\nK={k:2d}:")
            print(f"  Objective Score: {obj_scores['objective_score']:.4f}")
            print(f"  ‚îî‚îÄ Entropy:      {obj_scores['entropy_score']:.4f} (avg={obj_scores['avg_entropy']:.3f})")
            print(f"  ‚îî‚îÄ Balance:      {obj_scores['balance_score']:.4f} (gini={obj_scores['gini']:.3f})")
            print(f"  ‚îî‚îÄ Length:       {obj_scores['length_score']:.4f}")
            print(f"  ‚îî‚îÄ Word:         {obj_scores['word_score']:.4f}")
            print(f"  Silhouette:      {sil:.4f} (for reference)")
            print(f"  Max Dataset %:   {obj_scores['avg_max_dataset_pct']:.1%}")
            print(f"  Time:            {elapsed:.2f}s")

            # Show if dataset-dominated
            if obj_scores['avg_max_dataset_pct'] > 0.7:
                print(f"  ‚ö†Ô∏è  WARNING: Clusters dominated by single datasets!")
            elif obj_scores['avg_entropy'] > 1.0:
                print(f"  ‚úÖ GOOD: Datasets well-mixed across clusters")

        except Exception as e:
            # Best effort: a failing configuration is reported and skipped.
            print(f"  K={k:2d}: Failed - {str(e)[:50]}")

# Fail with a clear message instead of an opaque KeyError on an empty frame.
if not results:
    raise RuntimeError(
        "No clustering configuration completed successfully; "
        "see the failure messages above."
    )

results_df = pd.DataFrame(results)

print(f"\n{'='*70}")
print("TOP 10 BY OBJECTIVE SCORE")
print(f"{'='*70}")
top_results = results_df.nlargest(10, 'objective_score')
print(top_results[['linkage', 'k', 'objective_score', 'avg_entropy', 'gini', 'silhouette']])

# Row with the highest objective score; later cells read `best_result`.
best_result = results_df.loc[results_df['objective_score'].idxmax()]
print(f"\n‚úÖ BEST CONFIGURATION:")
print(f"   Linkage: {best_result['linkage']}")
print(f"   K: {int(best_result['k'])}")
print(f"   Objective Score: {best_result['objective_score']:.4f}")
print(f"   Avg Entropy: {best_result['avg_entropy']:.3f} {'‚úÖ' if best_result['avg_entropy'] > 1.0 else '‚ö†Ô∏è'}")
print(f"   Gini: {best_result['gini']:.3f} {'‚úÖ' if best_result['gini'] < 0.3 else '‚ö†Ô∏è'}")
print(f"   Silhouette: {best_result['silhouette']:.4f}")

## 6. 📊 Objective Score vs Silhouette Comparison

In [None]:
rule = "=" * 70
print(rule)
print("OBJECTIVE SCORE VS SILHOUETTE SCORE COMPARISON")
print(rule)

# Winning row under each criterion; later cells read both names.
best_by_objective = results_df.loc[results_df['objective_score'].idxmax()]
best_by_silhouette = results_df.loc[results_df['silhouette'].idxmax()]
k_by_silhouette = int(best_by_silhouette['k'])
k_by_objective = int(best_by_objective['k'])

# --- Silhouette winner ---
print(f"\nüî∑ BEST BY SILHOUETTE (Traditional Metric):")
print(f"   Config: {best_by_silhouette['linkage']} linkage, K={k_by_silhouette}")
print(f"   Silhouette: {best_by_silhouette['silhouette']:.4f}")
print(f"   Objective Score: {best_by_silhouette['objective_score']:.4f}")
print(f"   Avg Entropy: {best_by_silhouette['avg_entropy']:.3f}")
print(f"   Max Dataset %: {best_by_silhouette['avg_max_dataset_pct']:.1%}")
print(f"   Gini: {best_by_silhouette['gini']:.3f}")

# Flag the silhouette winner when one dataset holds over 70% of a cluster on average.
if best_by_silhouette['avg_max_dataset_pct'] > 0.7:
    print(f"   ‚ùå PROBLEM: Just separating datasets!")

# --- Objective-score winner ---
print(f"\nüéØ BEST BY OBJECTIVE SCORE (Routing-Focused):")
print(f"   Config: {best_by_objective['linkage']} linkage, K={k_by_objective}")
print(f"   Objective Score: {best_by_objective['objective_score']:.4f}")
print(f"   Avg Entropy: {best_by_objective['avg_entropy']:.3f}")
print(f"   Max Dataset %: {best_by_objective['avg_max_dataset_pct']:.1%}")
print(f"   Gini: {best_by_objective['gini']:.3f}")
print(f"   Silhouette: {best_by_objective['silhouette']:.4f}")

meets_mixing_target = best_by_objective['avg_entropy'] > 1.0
meets_balance_target = best_by_objective['gini'] < 0.3
if meets_mixing_target and meets_balance_target:
    print(f"   ‚úÖ GOOD: Datasets mixed, clusters balanced!")

print(f"\n{rule}")
print("KEY INSIGHT:")
print(rule)

# The two criteria count as agreeing when their K values differ by at most 2.
if abs(k_by_silhouette - k_by_objective) <= 2:
    print(f"‚úÖ Both metrics agree on K~{k_by_objective}")
else:
    print(f"‚ö†Ô∏è  Traditional silhouette score recommends K={k_by_silhouette}")
    print(f"    But objective score shows K={k_by_objective} is better for routing!")
    print(f"\n    Why? Silhouette optimizes for geometric separation,")
    print(f"    but we need dataset mixing and balanced clusters for routing.")

## 7. 📊 Visualizations

In [None]:
# 2x2 grid: one panel per metric, one line per linkage method.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# (axes, results_df column, title, y-axis label, optional (y, label) target line)
panel_specs = [
    (axes[0, 0], 'objective_score', 'Objective Score (Higher = Better)',
     'Objective Score', None),
    (axes[0, 1], 'avg_entropy', 'Dataset Mixing Entropy (Higher = Better)',
     'Average Entropy', (1.0, 'Target (>1.0)')),
    (axes[1, 0], 'gini', 'Cluster Balance (Lower = Better)',
     'Gini Coefficient', (0.3, 'Target (<0.3)')),
    (axes[1, 1], 'silhouette', 'Silhouette Score (For Reference Only)',
     'Silhouette Score', None),
]

for panel_ax, column, title, y_label, target in panel_specs:
    # Curves first, so they precede the target line in the legend.
    for linkage_method in ['ward', 'complete', 'average']:
        subset = results_df[results_df['linkage'] == linkage_method]
        panel_ax.plot(subset['k'], subset[column], marker='o', label=linkage_method)

    panel_ax.set_title(title, fontweight='bold', fontsize=12)
    panel_ax.set_xlabel('Number of Clusters (K)')
    panel_ax.set_ylabel(y_label)

    # Dashed green reference line for metrics that have a stated target.
    if target is not None:
        target_y, target_label = target
        panel_ax.axhline(y=target_y, color='green', linestyle='--', alpha=0.7, label=target_label)

    panel_ax.legend()
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Scatter plot: Objective Score vs Silhouette Score
# One bubble per (linkage, K) configuration; bubble area grows with K.
fig, ax = plt.subplots(figsize=(12, 8))

for linkage_method in ['ward', 'complete', 'average']:
    subset = results_df[results_df['linkage'] == linkage_method]

    # A linkage whose every configuration failed has no rows; idxmax() on an
    # empty selection would raise, so skip it.
    if subset.empty:
        continue

    ax.scatter(
        subset['silhouette'],
        subset['objective_score'],
        s=subset['k'] * 10,  # Size by K
        alpha=0.6,
        label=linkage_method
    )

    # Annotate best by objective
    best = subset.loc[subset['objective_score'].idxmax()]
    ax.annotate(
        f"K={int(best['k'])}",
        (best['silhouette'], best['objective_score']),
        xytext=(10, 10),
        textcoords='offset points',
        fontsize=10,
        fontweight='bold'
    )

# Add diagonal line. It must be drawn before ax.legend(): the legend only lists
# artists that exist when it is created, so drawing it afterwards silently
# drops the 'Equal scores' entry.
ax.plot([0, 1], [0, 1], 'k--', alpha=0.3, label='Equal scores')

ax.set_xlabel('Silhouette Score (Traditional Metric)', fontsize=12)
ax.set_ylabel('Objective Score (Routing-Focused)', fontsize=12)
ax.set_title('Objective Score vs Silhouette Score\n(Bubble size = K)', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("üí° Points in upper-left: High objective but low silhouette = Good for routing!")
print("   Points in lower-right: High silhouette but low objective = Just separating datasets!")

## 8. 🔍 Detailed Analysis of Best Configuration

In [None]:
# Refit the winning configuration so per-sample labels are available.
best_linkage = best_result['linkage']
best_k = int(best_result['k'])
rule = "=" * 70

print(rule)
print(f"DETAILED ANALYSIS: {best_linkage.upper()} LINKAGE, K={best_k}")
print(rule)

# Same linkage/metric pairing as the search: ward with euclidean, otherwise cosine.
best_metric = 'euclidean' if best_linkage == 'ward' else 'cosine'
agg_best = AgglomerativeClustering(n_clusters=best_k, linkage=best_linkage, metric=best_metric)

labels_best = agg_best.fit_predict(codebert_norm)
df['cluster'] = labels_best

# Per-cluster analysis
print(f"\nPER-CLUSTER STATISTICS:")
print(rule)

total_rows = len(df)
for cluster_id in range(best_k):
    cluster_df = df[df['cluster'] == cluster_id]
    cluster_size = len(cluster_df)

    print(f"\nCluster {cluster_id}:")
    print(f"  Size: {cluster_size} ({cluster_size/total_rows*100:.1f}%)")

    # How many samples each source dataset contributes to this cluster.
    source_counts = cluster_df['source'].value_counts()
    print(f"  Dataset distribution:")
    for source, count in source_counts.items():
        share = count / cluster_size * 100
        print(f"    {source}: {count} ({share:.1f}%)")

    # Entropy of the source distribution; this per-cluster view flags anything at or below 0.8.
    probs = source_counts.values / source_counts.sum()
    cluster_entropy = entropy(probs)
    entropy_flag = '‚úÖ' if cluster_entropy > 0.8 else '‚ö†Ô∏è'
    print(f"  Entropy: {cluster_entropy:.3f} {entropy_flag}")

    # Mean and standard deviation of the two size measures.
    for stat_label, column in (('Prompt length', 'prompt_length'), ('Word count', 'word_count')):
        values = cluster_df[column]
        print(f"  {stat_label}: mean={values.mean():.0f}, std={values.std():.0f}")

print(f"\n{rule}")
print(f"\n{'='*70}")

## 9. 💾 Export Results

In [None]:
# Full grid of results, one row per evaluated configuration.
results_df.to_csv('objective_clustering_results.csv', index=False)

# Convert the selected metrics to plain floats so they serialise as JSON numbers.
best_metrics = {
    key: float(best_result[key])
    for key in ('objective_score', 'avg_entropy', 'gini', 'avg_max_dataset_pct', 'silhouette')
}
silhouette_pick = {
    'linkage': best_by_silhouette['linkage'],
    'k': int(best_by_silhouette['k']),
    'silhouette': float(best_by_silhouette['silhouette']),
    'objective_score': float(best_by_silhouette['objective_score'])
}
objective_pick = {
    'linkage': best_by_objective['linkage'],
    'k': int(best_by_objective['k']),
    'objective_score': float(best_by_objective['objective_score']),
    'silhouette': float(best_by_objective['silhouette'])
}

# Winning configuration, the silhouette comparison, and a legend for the metrics.
best_config = {
    'experiment': 'objective_clustering_evaluation',
    'embedding_model': 'CodeBERT',
    'best_configuration': {
        'linkage': best_linkage,
        'k': best_k,
        'metric': 'euclidean' if best_linkage == 'ward' else 'cosine',
        **best_metrics
    },
    'comparison': {
        'best_by_silhouette': silhouette_pick,
        'best_by_objective': objective_pick
    },
    'metrics_explanation': {
        'objective_score': 'Combined score (higher = better for routing)',
        'avg_entropy': 'Dataset mixing (>1.0 = good)',
        'gini': 'Cluster balance (<0.3 = good)',
        'avg_max_dataset_pct': 'Max dataset % per cluster (<0.5 = good)'
    }
}

with open('objective_clustering_best_config.json', 'w') as f:
    f.write(json.dumps(best_config, indent=2))

print('‚úÖ Exported:')
for exported_name in ('objective_clustering_results.csv', 'objective_clustering_best_config.json'):
    print(f'  - {exported_name}')

## 10. 📝 Final Summary

In [None]:
print('='*80)
print('OBJECTIVE CLUSTERING EVALUATION - FINAL SUMMARY')
print('='*80)

# Evaluate the thresholds once, so the narrative below reports what was
# measured instead of asserting a favourable outcome unconditionally.
mixing_ok = best_result['avg_entropy'] > 1.0
balance_ok = best_result['gini'] < 0.3
silhouette_pick_splits_datasets = best_by_silhouette['avg_max_dataset_pct'] > 0.7

print(f'\nüìä EXPERIMENTS CONDUCTED:')
print(f'  Linkage methods: ward, complete, average')
print(f'  K values tested: 2, 3, 4, 5, 6, 8, 10, 12, 15, 20, 25, 30')
print(f'  Total configurations: {len(results_df)}')

print(f'\nüèÜ BEST CONFIGURATION (By Objective Score):')
print(f"  Linkage: {best_linkage}")
print(f"  K: {best_k}")
print(f"  Objective Score: {best_result['objective_score']:.4f}")
print(f"  Avg Entropy: {best_result['avg_entropy']:.3f} {'‚úÖ GOOD' if mixing_ok else '‚ö†Ô∏è WEAK'}")
print(f"  Gini: {best_result['gini']:.3f} {'‚úÖ BALANCED' if balance_ok else '‚ö†Ô∏è UNBALANCED'}")
print(f"  Max Dataset %: {best_result['avg_max_dataset_pct']:.1%}")
print(f"  Silhouette: {best_result['silhouette']:.4f}")

print(f'\nüî∑ BEST BY SILHOUETTE (For Comparison):')
print(f"  Linkage: {best_by_silhouette['linkage']}")
print(f"  K: {int(best_by_silhouette['k'])}")
print(f"  Silhouette: {best_by_silhouette['silhouette']:.4f}")
print(f"  Objective Score: {best_by_silhouette['objective_score']:.4f}")
print(f"  Max Dataset %: {best_by_silhouette['avg_max_dataset_pct']:.1%}")

if silhouette_pick_splits_datasets:
    print(f"  ‚ùå WARNING: This configuration just separates datasets!")

# Wording matches the original when both targets are met; otherwise it says so.
mixing_phrase = 'datasets mixed' if mixing_ok else 'datasets only weakly mixed'
balance_phrase = 'balanced clusters' if balance_ok else 'unbalanced clusters'

print(f'\nüí° KEY FINDINGS:')
print(f"  1. Silhouette score is MISLEADING for routing tasks")
print(f"  2. Best by silhouette: K={int(best_by_silhouette['k'])} (but {best_by_silhouette['avg_max_dataset_pct']:.0%} dataset-dominated)")
print(f"  3. Best by objective: K={best_k} ({mixing_phrase}, {balance_phrase})")
print(f"  4. Dataset mixing entropy is the most important metric")
print(f"  5. Need K>={best_k} for meaningful routing differentiation")

print(f'\nüéØ PRODUCTION RECOMMENDATION:')
print(f"  ‚úÖ Use: {best_linkage} linkage, K={best_k}")
print(f"  Why:")
if mixing_ok:
    print(f"    - Datasets well-mixed (entropy={best_result['avg_entropy']:.2f})")
else:
    print(f"    - Dataset mixing is below the 1.0 target (entropy={best_result['avg_entropy']:.2f})")
if balance_ok:
    print(f"    - Balanced clusters (gini={best_result['gini']:.2f})")
else:
    print(f"    - Cluster sizes miss the 0.3 Gini target (gini={best_result['gini']:.2f})")
if mixing_ok:
    print(f"    - Groups by task properties, not dataset source")
print(f"    - Sufficient granularity for error rate differentiation")

# Only print the section when there is something to warn about; the header
# used to appear on its own whenever the condition was false.
if silhouette_pick_splits_datasets:
    print(f'\n‚ö†Ô∏è  DO NOT USE:')
    print(f"  ‚ùå K={int(best_by_silhouette['k'])} (high silhouette but just separates datasets)")
    # The original interpolated the linkage name here, where a dataset was meant.
    print(f"     This would route almost every task from a given dataset to the same model!")

# Double-quoted outer string: reusing the outer quote character inside an
# f-string replacement field is a SyntaxError before Python 3.12.
print(f"\n{'='*80}")