# SWE-bench Clustering Analysis

## Overview
This notebook performs comprehensive clustering analysis on the SWE-bench dataset to create semantic clusters for model routing.

## Workflow
1. **Data Loading**: Load the SWE-bench training split and the SWE-bench Verified test split
2. **Embedding**: Generate 768-D embeddings using CodeRankEmbed
3. **UMAP**: Reduce to 3D/2D for visualization and clustering
4. **Clustering**: Compare 5 algorithms with parameter sweeps
5. **Evaluation**: Keep only configurations in which every cluster contains at least 3 SWE-bench Verified instances
6. **Selection**: User selects best configuration
7. **Export**: Save centroids and cluster data

## Requirements
- GPU recommended for embedding generation
- ~30-60 minutes runtime on GPU

---
## Section 1: Setup & Data Loading

In [None]:
# Install dependencies (run once)
# !pip install -q sentence-transformers umap-learn hdbscan plotly datasets scikit-learn pandas numpy tqdm

In [1]:
# Core imports
import os
import json
import time
import warnings
from pathlib import Path
from collections import Counter
from datetime import datetime
from itertools import product

# Data processing
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# Machine Learning
import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import umap
import hdbscan
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import normalize

# Metrics
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Visualization
import plotly.graph_objects as go
import plotly.express as px

# --- Global configuration ---
# Silence all library warnings so the cell outputs below stay readable.
warnings.filterwarnings('ignore')

# One seed shared by NumPy, PyTorch and every seeded estimator in the notebook.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Cached arrays and HTML figures are all written below this directory.
ARTIFACTS_DIR = Path("artifacts")
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

if device == "cuda":
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {gpu_props.total_memory / 1e9:.2f} GB")

  $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.


Using device: cuda
GPU: NVIDIA L4
GPU Memory: 23.80 GB


In [2]:
# Fetch the SWE-bench train split and report how long the load took.
print("Loading SWE-bench training dataset...")
start_time = time.time()
dataset_train = load_dataset("princeton-nlp/SWE-bench", split="train")
load_seconds = time.time() - start_time
print(f"Loaded {len(dataset_train)} training instances in {load_seconds:.2f}s")

# Seeded shuffle, then materialise as a DataFrame. Arrays derived later
# (texts, embeddings, UMAP coordinates) are matched to `df` by row position.
dataset_train = dataset_train.shuffle(seed=RANDOM_SEED)
df = pd.DataFrame(dataset_train)

print(f"\nDataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

Loading SWE-bench training dataset...


README.md: 0.00B [00:00, ?B/s]

data/dev-00000-of-00001.parquet:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/12.1M [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/107M [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/225 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2294 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/19008 [00:00<?, ? examples/s]

Loaded 19008 training instances in 12.51s

Dataset shape: (19008, 12)
Columns: ['repo', 'instance_id', 'base_commit', 'patch', 'test_patch', 'problem_statement', 'hints_text', 'created_at', 'version', 'FAIL_TO_PASS', 'PASS_TO_PASS', 'environment_setup_commit']


In [3]:
# Load SWE-bench Verified dataset (published as a "test" split)
print("Loading SWE-bench Verified dataset...")
dataset_verified = load_dataset("princeton-nlp/SWE-bench_Verified", split="test")
verified_ids = set(item['instance_id'] for item in dataset_verified)
print(f"Loaded {len(verified_ids)} verified instance IDs")

# Mark verified instances in main dataset by exact instance_id match
df['is_verified'] = df['instance_id'].isin(verified_ids)
verified_count = df['is_verified'].sum()
print(f"\nVerified instances in training set: {verified_count} ({verified_count/len(df)*100:.1f}%)")

# Make a zero-overlap result impossible to miss: every statistic or filter
# derived from the 'is_verified' column is then trivially 0 / False.
if verified_count == 0:
    print(
        "WARNING: none of the Verified instance IDs occur in this frame, so "
        "'is_verified' is False for every row. Per-cluster verified counts "
        "computed from this column will all be 0."
    )

Loading SWE-bench Verified dataset...


README.md: 0.00B [00:00, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Loaded 500 verified instance IDs

Verified instances in training set: 0 (0.0%)


In [4]:
# Headline counts for the training frame.
rule = "=" * 80
print(rule)
print("DATASET STATISTICS")
print(rule)

print(f"\nTotal instances: {len(df):,}")
print(f"Verified instances: {verified_count:,}")
print(f"Number of repositories: {df['repo'].nunique()}")

# Ten largest repositories, each with the number of its rows flagged as verified.
print("\nTop 10 repositories by instance count:")
repo_counts = df['repo'].value_counts().head(10)
verified_rows = df[df['is_verified']]
for repo, count in repo_counts.items():
    verified_in_repo = (verified_rows['repo'] == repo).sum()
    print(f"  {repo}: {count:,} ({verified_in_repo} verified)")

# Repositories ranked by number of verified rows (prints nothing when none are flagged).
print("\nVerified instances by repository:")
verified_by_repo = verified_rows['repo'].value_counts().head(10)
for repo, count in verified_by_repo.items():
    print(f"  {repo}: {count}")

# Character length of each problem statement, stored as a new column on `df`.
df['problem_length'] = df['problem_statement'].str.len()
length_summary = df['problem_length'].agg(['mean', 'median', 'min', 'max'])
print(f"\nProblem statement length (chars):")
for heading, stat in (("Mean", "mean"), ("Median", "median"), ("Min", "min"), ("Max", "max")):
    print(f"  {heading}: {length_summary[stat]:.0f}")

DATASET STATISTICS

Total instances: 19,008
Verified instances: 0
Number of repositories: 35

Top 10 repositories by instance count:
  pandas-dev/pandas: 5,049 (0 verified)
  Qiskit/qiskit: 1,406 (0 verified)
  huggingface/transformers: 1,058 (0 verified)
  mesonbuild/meson: 954 (0 verified)
  numpy/numpy: 937 (0 verified)
  googleapis/google-cloud-python: 926 (0 verified)
  pantsbuild/pants: 900 (0 verified)
  conan-io/conan: 855 (0 verified)
  ipython/ipython: 850 (0 verified)
  pypa/pip: 686 (0 verified)

Verified instances by repository:

Problem statement length (chars):
  Mean: 1944
  Median: 1073
  Min: 10
  Max: 256368


---
## Section 2: Embedding Generation

In [5]:
def prepare_text(row):
    """Build the text that gets embedded for one SWE-bench instance.

    The stripped ``problem_statement`` comes first. When ``hints_text`` is a
    non-empty string it is appended under a ``Hints:`` heading; missing or
    non-string hints are skipped. The result is prefixed with the query
    instruction string used with CodeRankEmbed.

    Args:
        row: Mapping-like record (dict or pandas Series) with a
            ``problem_statement`` string and an optional ``hints_text``.

    Returns:
        The prefixed text as a single string.
    """
    body = row['problem_statement'].strip()
    hints = row.get('hints_text', '')

    # A truthy str is necessarily non-empty, so no separate length check is needed.
    if hints and isinstance(hints, str):
        body = f"{body}\n\nHints:\n{hints}"

    # Add CodeRankEmbed required prefix
    return f"Represent this query for searching relevant code: {body}"

# Build one embedding input per row, in the same order as `df`.
print("Preparing texts for embedding...")
texts = []
for _, row in df.iterrows():
    texts.append(prepare_text(row))
print(f"Prepared {len(texts)} texts")

# Character-length summary of the prepared inputs.
text_lengths = list(map(len, texts))
print(f"\nText length stats (chars):")
print(f"  Mean: {np.mean(text_lengths):.0f}")
print(f"  Max: {np.max(text_lengths):.0f}")

Preparing texts for embedding...
Prepared 19008 texts

Text length stats (chars):
  Mean: 3676
  Max: 915046


In [6]:
# Reuse cached embeddings when present; otherwise load the encoder so the
# next cell can generate them.
embeddings_path = ARTIFACTS_DIR / "embeddings_768d.npy"

if embeddings_path.exists():
    print(f"Loading cached embeddings from {embeddings_path}...")
    embeddings = np.load(embeddings_path)
    print(f"Loaded embeddings: {embeddings.shape}")

    # The cache is matched to `texts` purely by row position, so a file
    # written for a different number of rows must not be reused silently.
    if embeddings.shape[0] != len(texts):
        raise ValueError(
            f"Cached embeddings have {embeddings.shape[0]} rows but {len(texts)} texts "
            f"were prepared; delete {embeddings_path} to regenerate them."
        )
else:
    # Load CodeRankEmbed model
    # NOTE(review): trust_remote_code=True executes model code downloaded from
    # the Hub; consider pinning a `revision` so that code cannot change
    # between runs.
    print("Loading CodeRankEmbed model...")
    model = SentenceTransformer(
        "nomic-ai/CodeRankEmbed",
        trust_remote_code=True,
        device=device
    )

    # Set to FP16 for faster inference on GPU
    if device == "cuda":
        model = model.half()

    print(f"Model loaded: {model.get_sentence_embedding_dimension()}-D embeddings")
    print(f"Max sequence length: {model.max_seq_length} tokens")

Loading CodeRankEmbed model...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/245 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/140 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

configuration_hf_nomic_bert.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/CodeRankEmbed:
- configuration_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hf_nomic_bert.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/CodeRankEmbed:
- modeling_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/547M [00:00<?, ?B/s]



tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Model loaded: 768-D embeddings
Max sequence length: 8192 tokens


In [7]:
# Encode every prepared text, chunk by chunk. Skipped entirely when the
# cache file already exists.
if not embeddings_path.exists():
    CHUNK_SIZE = 4000
    BATCH_SIZE = 2  # Small batch for memory safety

    print(f"\nGenerating embeddings for {len(texts)} samples...")
    print(f"Processing in chunks of {CHUNK_SIZE}")

    all_embeddings = []
    start_time = time.time()

    chunk_offsets = range(0, len(texts), CHUNK_SIZE)
    for chunk_no, offset in enumerate(chunk_offsets, start=1):
        chunk_texts = texts[offset:offset + CHUNK_SIZE]
        chunk_start = time.time()

        # L2-normalised vectors returned as a NumPy array
        chunk_embeddings = model.encode(
            chunk_texts,
            batch_size=BATCH_SIZE,
            show_progress_bar=True,
            normalize_embeddings=True,
            convert_to_numpy=True,
        )
        all_embeddings.append(chunk_embeddings)

        # Per-chunk throughput report
        chunk_time = time.time() - chunk_start
        samples_per_sec = len(chunk_texts) / chunk_time
        print(f"Chunk {chunk_no}: {len(chunk_texts)} samples in {chunk_time:.1f}s ({samples_per_sec:.1f} samples/s)")

        # Release cached GPU memory between chunks
        if device == "cuda":
            torch.cuda.empty_cache()

    # Stack the per-chunk arrays row-wise into one matrix
    embeddings = np.concatenate(all_embeddings, axis=0)
    total_time = time.time() - start_time

    print(f"\nTotal embedding generation:")
    print(f"  Time: {total_time/60:.1f} minutes")
    print(f"  Shape: {embeddings.shape}")
    print(f"  Memory: {embeddings.nbytes / 1e6:.1f} MB")

    # Persist so later runs can skip the encoding step
    np.save(embeddings_path, embeddings)
    print(f"\nSaved embeddings to {embeddings_path}")

    # The encoder is no longer needed; drop it and free GPU memory
    del model
    if device == "cuda":
        torch.cuda.empty_cache()


Generating embeddings for 19008 samples...
Processing in chunks of 4000


Batches:   0%|          | 0/2000 [00:00<?, ?it/s]

Chunk 1: 4000 samples in 200.7s (19.9 samples/s)


Batches:   0%|          | 0/2000 [00:00<?, ?it/s]

Chunk 2: 4000 samples in 205.9s (19.4 samples/s)


Batches:   0%|          | 0/2000 [00:00<?, ?it/s]

Chunk 3: 4000 samples in 198.4s (20.2 samples/s)


Batches:   0%|          | 0/2000 [00:00<?, ?it/s]

Chunk 4: 4000 samples in 205.0s (19.5 samples/s)


Batches:   0%|          | 0/1504 [00:00<?, ?it/s]

Chunk 5: 3008 samples in 157.1s (19.1 samples/s)

Total embedding generation:
  Time: 16.1 minutes
  Shape: (19008, 768)
  Memory: 29.2 MB

Saved embeddings to artifacts/embeddings_768d.npy


---
## Section 3: UMAP Dimensionality Reduction

In [8]:
# Cache locations for the two UMAP projections
umap_3d_path = ARTIFACTS_DIR / "umap_3d.npy"
umap_2d_path = ARTIFACTS_DIR / "umap_2d.npy"

# Handle each cache file on its own. Requiring both files before loading
# either one would recompute the 3D projection needlessly when only the 2D
# file is missing, and would never load a cached 2D projection when only the
# 3D file is missing.
if umap_3d_path.exists():
    print("Loading cached UMAP embeddings...")
    embeddings_3d = np.load(umap_3d_path)
    print(f"3D shape: {embeddings_3d.shape}")
else:
    # UMAP 3D (for clustering)
    print("Applying UMAP 3D reduction...")
    start_time = time.time()

    reducer_3d = umap.UMAP(
        n_components=3,
        n_neighbors=15,
        min_dist=0.1,
        metric='cosine',
        random_state=RANDOM_SEED,
        n_jobs=-1
    )

    embeddings_3d = reducer_3d.fit_transform(embeddings)
    print(f"UMAP 3D completed in {time.time() - start_time:.1f}s")
    print(f"Shape: {embeddings_3d.shape}")

    # Save 3D embeddings
    np.save(umap_3d_path, embeddings_3d)
    print(f"Saved to {umap_3d_path}")

# Pick up a cached 2D projection whenever it exists, so `embeddings_2d` is
# defined regardless of whether the 3D projection came from cache.
if umap_2d_path.exists():
    embeddings_2d = np.load(umap_2d_path)
    print(f"2D shape: {embeddings_2d.shape}")

Applying UMAP 3D reduction...
UMAP 3D completed in 39.1s
Shape: (19008, 3)
Saved to artifacts/umap_3d.npy


In [9]:
# UMAP 2D (for visualization): load from cache when available, otherwise fit.
# Loading here (instead of doing nothing when the file exists) guarantees
# `embeddings_2d` is defined even if no earlier cell loaded it.
if umap_2d_path.exists():
    embeddings_2d = np.load(umap_2d_path)
    print(f"Loaded cached UMAP 2D embeddings: {embeddings_2d.shape}")
else:
    print("\nApplying UMAP 2D reduction...")
    start_time = time.time()

    reducer_2d = umap.UMAP(
        n_components=2,
        n_neighbors=15,
        min_dist=0.1,
        metric='cosine',
        random_state=RANDOM_SEED,
        n_jobs=-1
    )

    embeddings_2d = reducer_2d.fit_transform(embeddings)
    print(f"UMAP 2D completed in {time.time() - start_time:.1f}s")
    print(f"Shape: {embeddings_2d.shape}")

    # Save 2D embeddings
    np.save(umap_2d_path, embeddings_2d)
    print(f"Saved to {umap_2d_path}")


Applying UMAP 2D reduction...
UMAP 2D completed in 17.5s
Shape: (19008, 2)
Saved to artifacts/umap_2d.npy


---
## Section 4: 3D Visualization (Pre-Clustering)

In [10]:
# Attach the UMAP coordinates to the frame so the plotting cells can refer
# to them by column name.
for axis_no, column in enumerate(['umap_x', 'umap_y']):
    df[column] = embeddings_2d[:, axis_no]
for axis_no, column in enumerate(['umap_3d_x', 'umap_3d_y', 'umap_3d_z']):
    df[column] = embeddings_3d[:, axis_no]

In [11]:
# Interactive 3D scatter of the UMAP projection, one colour per repository.
print("Creating 3D visualization (by repository)...")

axis_labels = {'umap_3d_x': 'UMAP 1', 'umap_3d_y': 'UMAP 2', 'umap_3d_z': 'UMAP 3'}
fig = px.scatter_3d(
    df,
    x='umap_3d_x',
    y='umap_3d_y',
    z='umap_3d_z',
    color='repo',
    hover_data=['instance_id', 'is_verified'],
    title='SWE-bench Embeddings by Repository (3D UMAP)',
    labels=axis_labels,
    height=800,
    opacity=0.7,
)

# Small markers keep the point cloud readable
fig.update_traces(marker={'size': 2})
fig.update_layout(
    scene={
        'xaxis_title': 'UMAP 1',
        'yaxis_title': 'UMAP 2',
        'zaxis_title': 'UMAP 3',
    }
)

# Write a standalone HTML copy before rendering inline
viz_repo_path = ARTIFACTS_DIR / "viz_by_repository_3d.html"
fig.write_html(viz_repo_path)
print(f"Saved to {viz_repo_path}")

fig.show()

Creating 3D visualization (by repository)...
Saved to artifacts/viz_by_repository_3d.html


In [12]:
# Interactive 3D scatter of the UMAP projection, coloured by verified flag.
print("Creating 3D visualization (by verified status)...")

# Human-readable legend label derived from the boolean flag
df['verified_label'] = df['is_verified'].map({True: 'Verified', False: 'Not Verified'})

axis_labels = {'umap_3d_x': 'UMAP 1', 'umap_3d_y': 'UMAP 2', 'umap_3d_z': 'UMAP 3'}
status_colors = {'Verified': '#FF6B6B', 'Not Verified': '#4ECDC4'}
fig = px.scatter_3d(
    df,
    x='umap_3d_x',
    y='umap_3d_y',
    z='umap_3d_z',
    color='verified_label',
    color_discrete_map=status_colors,
    hover_data=['instance_id', 'repo'],
    title=f'SWE-bench Embeddings: Verified vs Non-Verified ({verified_count} verified)',
    labels=axis_labels,
    height=800,
    opacity=0.7,
)

fig.update_traces(marker={'size': 3})
fig.update_layout(
    scene={
        'xaxis_title': 'UMAP 1',
        'yaxis_title': 'UMAP 2',
        'zaxis_title': 'UMAP 3',
    }
)

# Write a standalone HTML copy before rendering inline
viz_verified_path = ARTIFACTS_DIR / "viz_by_verified_3d.html"
fig.write_html(viz_verified_path)
print(f"Saved to {viz_verified_path}")

fig.show()

Creating 3D visualization (by verified status)...
Saved to artifacts/viz_by_verified_3d.html


---
## Section 5: Multi-Algorithm Clustering with Parameter Sweep

In [13]:
def calculate_metrics(data, labels):
    """Calculate clustering quality metrics, ignoring noise points.

    Args:
        data: Array-like of shape (n_samples, n_features) holding the points
            that were clustered.
        labels: Array-like of integer cluster labels, one per row of ``data``.
            Negative labels are treated as noise and excluded from the scores.

    Returns:
        dict with keys, in this order:
            silhouette, davies_bouldin, calinski_harabasz: the scikit-learn
                scores over the non-noise points, or NaN when they cannot be
                computed (fewer than 2 non-noise points, fewer than 2
                clusters, or every non-noise point in its own cluster).
            n_clusters: number of distinct non-noise labels (0 when fewer
                than 2 non-noise points exist).
            n_noise: number of noise points, as a Python int.
            noise_pct: percentage of all points that are noise, as a float.
    """
    # Accept lists and other array-likes, not only ndarrays.
    data = np.asarray(data)
    labels = np.asarray(labels)

    keep = labels >= 0  # True for every non-noise point
    summary = {
        'silhouette': np.nan,
        'davies_bouldin': np.nan,
        'calinski_harabasz': np.nan,
        'n_clusters': 0,
        'n_noise': int((~keep).sum()),
        'noise_pct': float((~keep).mean() * 100),
    }

    if keep.sum() < 2:
        return summary

    kept_data = data[keep]
    kept_labels = labels[keep]
    n_clusters = len(np.unique(kept_labels))
    summary['n_clusters'] = n_clusters

    # The three scores are only defined for 2 <= n_clusters <= n_samples - 1;
    # outside that range scikit-learn raises, so report NaN instead.
    if n_clusters < 2 or n_clusters >= len(kept_labels):
        return summary

    summary['silhouette'] = silhouette_score(kept_data, kept_labels)
    summary['davies_bouldin'] = davies_bouldin_score(kept_data, kept_labels)
    summary['calinski_harabasz'] = calinski_harabasz_score(kept_data, kept_labels)
    return summary


def count_verified_per_cluster(labels, is_verified):
    """Map each non-noise cluster id to the number of verified members it holds.

    ``labels`` and ``is_verified`` are parallel arrays. The label ``-1``
    (noise) is left out of the result; clusters without any verified member
    are reported with a count of 0.
    """
    return {
        int(cluster_id): int(is_verified[labels == cluster_id].sum())
        for cluster_id in np.unique(labels)
        if cluster_id != -1
    }


def min_verified_per_cluster(labels, is_verified):
    """Return the smallest per-cluster verified count, or 0 when there is no cluster."""
    per_cluster = count_verified_per_cluster(labels, is_verified)
    return min(per_cluster.values(), default=0)


print("Evaluation functions ready!")

Evaluation functions ready!


In [45]:
# Search space for the sweep: algorithm name -> {parameter name -> candidate values}.
PARAM_GRIDS = {
    'HDBSCAN': {
        'min_cluster_size': [50, 100, 150, 200, 300, 400, 500, 750],
        'min_samples': [3, 5, 10, 15, 20],
        'cluster_selection_method': ['eom', 'leaf'],
    },
    'K-Means': {
        'n_clusters': [3, 5, 7, 8, 10, 12, 15, 18, 20, 25, 30, 40],
    },
    'Agglomerative': {
        'n_clusters': [5, 8, 10, 12, 15, 18, 20, 25, 30],
        'linkage': ['ward', 'complete', 'average', 'single'],
    },
    'GMM': {
        'n_components': [5, 8, 10, 12, 15, 18, 20, 25, 30],
        'covariance_type': ['full', 'tied', 'diag', 'spherical'],
    },
    'Spectral': {
        'n_clusters': [5, 8, 10, 12, 15, 20, 25],
    },
}

# A grid's size is the number of points in the Cartesian product of its
# value lists; report it per algorithm and in total.
total_configs = 0
for algo, params in PARAM_GRIDS.items():
    n_configs = sum(1 for _ in product(*params.values()))
    print(f"{algo}: {n_configs} configurations")
    total_configs += n_configs

print(f"\nTotal configurations to test: {total_configs}")

HDBSCAN: 80 configurations
K-Means: 12 configurations
Agglomerative: 36 configurations
GMM: 36 configurations
Spectral: 7 configurations

Total configurations to test: 171


In [46]:
# A configuration passes only if every cluster holds at least this many
# verified instances.
MIN_VERIFIED_THRESHOLD = 3

# One dict per tested configuration; re-created here so that re-running the
# sweep cells starts from an empty list.
all_results = list()

# Row-wise L2-normalised copy of the full-dimensional embeddings
embeddings_norm = normalize(embeddings, norm='l2')

# Verified flag per row of `df`, as a plain NumPy array for mask indexing
is_verified_array = df['is_verified'].to_numpy()

In [47]:
# 1. HDBSCAN: sweep min_cluster_size x min_samples x cluster_selection_method
#    on the 3D UMAP coordinates.
banner = "=" * 80
print(banner)
print("Testing HDBSCAN...")
print(banner)

hdbscan_grid = PARAM_GRIDS['HDBSCAN']
params = list(product(
    hdbscan_grid['min_cluster_size'],
    hdbscan_grid['min_samples'],
    hdbscan_grid['cluster_selection_method'],
))

for min_size, min_samp, selection_method in tqdm(params, desc="HDBSCAN"):
    try:
        clusterer = hdbscan.HDBSCAN(
            min_cluster_size=min_size,
            min_samples=min_samp,
            metric='euclidean',
            cluster_selection_method=selection_method,
            core_dist_n_jobs=-1,
        )
        labels = clusterer.fit_predict(embeddings_3d)

        metrics = calculate_metrics(embeddings_3d, labels)
        verified_counts = count_verified_per_cluster(labels, is_verified_array)
        # Smallest per-cluster count, 0 when no cluster was found
        min_verified = min(verified_counts.values(), default=0)

        entry = {
            'algorithm': 'HDBSCAN',
            'params': dict(
                min_cluster_size=min_size,
                min_samples=min_samp,
                cluster_selection_method=selection_method,
            ),
            'labels': labels,
        }
        entry.update(metrics)
        entry['verified_per_cluster'] = verified_counts
        entry['min_verified'] = min_verified
        entry['passes_threshold'] = min_verified >= MIN_VERIFIED_THRESHOLD
        all_results.append(entry)
    except Exception as e:
        # Report the failing configuration and carry on with the rest of the grid
        print(f"Error with HDBSCAN ({min_size}, {min_samp}, {selection_method}): {e}")

n_tested = sum(r['algorithm'] == 'HDBSCAN' for r in all_results)
print(f"HDBSCAN: {n_tested} configs tested")

Testing HDBSCAN...


HDBSCAN:   0%|          | 0/80 [00:00<?, ?it/s]

HDBSCAN: 80 configs tested


In [48]:
# 2. K-Means: sweep the number of clusters on the 3D UMAP coordinates.
banner = "=" * 80
print(banner)
print("Testing K-Means...")
print(banner)

kmeans_ks = PARAM_GRIDS['K-Means']['n_clusters']
for n_clusters in tqdm(kmeans_ks, desc="K-Means"):
    try:
        clusterer = KMeans(
            n_clusters=n_clusters,
            random_state=RANDOM_SEED,
            n_init=10,
            max_iter=300,
        )
        labels = clusterer.fit_predict(embeddings_3d)

        metrics = calculate_metrics(embeddings_3d, labels)
        verified_counts = count_verified_per_cluster(labels, is_verified_array)
        # Smallest per-cluster count, 0 when no cluster was found
        min_verified = min(verified_counts.values(), default=0)

        entry = {
            'algorithm': 'K-Means',
            'params': dict(n_clusters=n_clusters),
            'labels': labels,
        }
        entry.update(metrics)
        entry['verified_per_cluster'] = verified_counts
        entry['min_verified'] = min_verified
        entry['passes_threshold'] = min_verified >= MIN_VERIFIED_THRESHOLD
        all_results.append(entry)
    except Exception as e:
        # Report the failing configuration and carry on with the rest of the grid
        print(f"Error with K-Means (k={n_clusters}): {e}")

n_tested = sum(r['algorithm'] == 'K-Means' for r in all_results)
print(f"K-Means: {n_tested} configs tested")

Testing K-Means...


K-Means:   0%|          | 0/12 [00:00<?, ?it/s]

K-Means: 12 configs tested


In [49]:
# 3. Agglomerative: sweep n_clusters x linkage on the 3D UMAP coordinates.
banner = "=" * 80
print(banner)
print("Testing Agglomerative Clustering...")
print(banner)

agglo_grid = PARAM_GRIDS['Agglomerative']
params = list(product(agglo_grid['n_clusters'], agglo_grid['linkage']))

for n_clusters, linkage in tqdm(params, desc="Agglomerative"):
    try:
        clusterer = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)
        labels = clusterer.fit_predict(embeddings_3d)

        metrics = calculate_metrics(embeddings_3d, labels)
        verified_counts = count_verified_per_cluster(labels, is_verified_array)
        # Smallest per-cluster count, 0 when no cluster was found
        min_verified = min(verified_counts.values(), default=0)

        entry = {
            'algorithm': 'Agglomerative',
            'params': dict(n_clusters=n_clusters, linkage=linkage),
            'labels': labels,
        }
        entry.update(metrics)
        entry['verified_per_cluster'] = verified_counts
        entry['min_verified'] = min_verified
        entry['passes_threshold'] = min_verified >= MIN_VERIFIED_THRESHOLD
        all_results.append(entry)
    except Exception as e:
        # Report the failing configuration and carry on with the rest of the grid
        print(f"Error with Agglomerative ({n_clusters}, {linkage}): {e}")

n_tested = sum(r['algorithm'] == 'Agglomerative' for r in all_results)
print(f"Agglomerative: {n_tested} configs tested")

Testing Agglomerative Clustering...


Agglomerative:   0%|          | 0/36 [00:00<?, ?it/s]

Agglomerative: 36 configs tested


In [50]:
# 4. Gaussian Mixture Model: sweep n_components x covariance_type on the
#    3D UMAP coordinates.
banner = "=" * 80
print(banner)
print("Testing Gaussian Mixture Model...")
print(banner)

gmm_grid = PARAM_GRIDS['GMM']
params = list(product(gmm_grid['n_components'], gmm_grid['covariance_type']))

for n_components, cov_type in tqdm(params, desc="GMM"):
    try:
        clusterer = GaussianMixture(
            n_components=n_components,
            covariance_type=cov_type,
            random_state=RANDOM_SEED,
            n_init=5,
        )
        labels = clusterer.fit_predict(embeddings_3d)

        metrics = calculate_metrics(embeddings_3d, labels)
        verified_counts = count_verified_per_cluster(labels, is_verified_array)
        # Smallest per-cluster count, 0 when no cluster was found
        min_verified = min(verified_counts.values(), default=0)

        entry = {
            'algorithm': 'GMM',
            'params': dict(n_components=n_components, covariance_type=cov_type),
            'labels': labels,
        }
        entry.update(metrics)
        entry['verified_per_cluster'] = verified_counts
        entry['min_verified'] = min_verified
        entry['passes_threshold'] = min_verified >= MIN_VERIFIED_THRESHOLD
        all_results.append(entry)
    except Exception as e:
        # Report the failing configuration and carry on with the rest of the grid
        print(f"Error with GMM ({n_components}, {cov_type}): {e}")

n_tested = sum(r['algorithm'] == 'GMM' for r in all_results)
print(f"GMM: {n_tested} configs tested")

Testing Gaussian Mixture Model...


GMM:   0%|          | 0/36 [00:00<?, ?it/s]

GMM: 36 configs tested


In [51]:
# 5. Spectral Clustering: sweep the number of clusters on the 3D UMAP
#    coordinates, using a nearest-neighbours affinity.
banner = "=" * 80
print(banner)
print("Testing Spectral Clustering...")
print(banner)

spectral_ks = PARAM_GRIDS['Spectral']['n_clusters']
for n_clusters in tqdm(spectral_ks, desc="Spectral"):
    try:
        clusterer = SpectralClustering(
            n_clusters=n_clusters,
            affinity='nearest_neighbors',
            n_neighbors=15,
            random_state=RANDOM_SEED,
            n_jobs=-1,
        )
        labels = clusterer.fit_predict(embeddings_3d)

        metrics = calculate_metrics(embeddings_3d, labels)
        verified_counts = count_verified_per_cluster(labels, is_verified_array)
        # Smallest per-cluster count, 0 when no cluster was found
        min_verified = min(verified_counts.values(), default=0)

        entry = {
            'algorithm': 'Spectral',
            'params': dict(n_clusters=n_clusters),
            'labels': labels,
        }
        entry.update(metrics)
        entry['verified_per_cluster'] = verified_counts
        entry['min_verified'] = min_verified
        entry['passes_threshold'] = min_verified >= MIN_VERIFIED_THRESHOLD
        all_results.append(entry)
    except Exception as e:
        # Report the failing configuration and carry on with the rest of the grid
        print(f"Error with Spectral (k={n_clusters}): {e}")

n_tested = sum(r['algorithm'] == 'Spectral' for r in all_results)
print(f"Spectral: {n_tested} configs tested")

Testing Spectral Clustering...


Spectral:   0%|          | 0/7 [00:00<?, ?it/s]

Spectral: 7 configs tested


In [52]:
# Sweep summary: how many configurations ran and how many of them satisfy
# the per-cluster verified minimum.
n_passing = sum(r['passes_threshold'] for r in all_results)
print(f"\nTotal configurations tested: {len(all_results)}")
print(f"Configurations passing threshold (min {MIN_VERIFIED_THRESHOLD} verified per cluster): {n_passing}")


Total configurations tested: 171
Configurations passing threshold (min 3 verified per cluster): 0


In [53]:
# Embed the SWE-bench Verified instances themselves and project them into
# the 3D UMAP space of the training data, so they can be assigned to
# clusters by position rather than by instance_id.
print("Loading SWE-bench Verified...")
dataset_verified = load_dataset("princeton-nlp/SWE-bench_Verified", split="test")
df_verified = pd.DataFrame(dataset_verified)
print(f"Loaded {len(df_verified)} verified instances")

# Prepare texts exactly as for the training instances
texts_verified = [prepare_text(row) for _, row in df_verified.iterrows()]

# Embed verified instances
print("Embedding verified instances...")
import gc
if device == "cuda":
    torch.cuda.empty_cache()
gc.collect()

model = SentenceTransformer("nomic-ai/CodeRankEmbed", trust_remote_code=True, device=device)
if device == "cuda":
    model = model.half()

# Reduce batch size to 1 to prevent OOM with long sequences (8192 tokens)
embeddings_verified = model.encode(
    texts_verified,
    batch_size=1,
    show_progress_bar=True,
    normalize_embeddings=True,
    convert_to_numpy=True
)
print(f"Shape: {embeddings_verified.shape}")

# Clean up model to free memory
del model
if device == "cuda":
    torch.cuda.empty_cache()
gc.collect()

# Re-fit a 3D reducer on the training embeddings with the same parameters
# and seed as the original 3D projection. Only the projected array is
# cached on disk, not the fitted reducer, so a fitted object has to be
# rebuilt before new points can be transformed.
print("Projecting verified to UMAP space...")
reducer_3d = umap.UMAP(
    n_components=3,
    n_neighbors=15,
    min_dist=0.1,
    metric='cosine',
    random_state=RANDOM_SEED
)
reducer_3d.fit(embeddings)

# The cluster labels were computed on `embeddings_3d`. The transform below
# is only meaningful if this re-fit reproduces that same layout, so check
# it instead of assuming it.
# NOTE(review): the original fit passed n_jobs=-1; umap-learn appears to
# force single-threaded execution when random_state is set, which would
# make the two fits equivalent - confirm for the installed version.
refit_3d = reducer_3d.embedding_
if refit_3d.shape != embeddings_3d.shape or not np.allclose(refit_3d, embeddings_3d, atol=1e-4):
    print(
        "WARNING: the re-fitted UMAP layout differs from `embeddings_3d`; "
        "verified points are projected into a different space than the one "
        "the cluster labels were computed in."
    )

verified_3d = reducer_3d.transform(embeddings_verified)
print(f"Verified 3D shape: {verified_3d.shape}")

Loading SWE-bench Verified...
Loaded 500 verified instances
Embedding verified instances...




Batches:   0%|          | 0/500 [00:00<?, ?it/s]

Shape: (500, 768)
Projecting verified to UMAP space...
Verified 3D shape: (500, 3)


In [54]:
# For each clustering result, assign every verified instance to the cluster
# whose centroid (mean of its training points in 3D UMAP space) is nearest,
# then recompute the per-cluster verified counts and the threshold flag.
from scipy.spatial.distance import cdist

print("Assigning verified instances to clusters...")

for result in all_results:
    labels = result['labels']

    # Non-noise cluster ids in ascending order (np.unique returns them sorted);
    # position i in every array below refers to cluster_ids[i].
    cluster_ids = [int(c) for c in np.unique(labels) if c != -1]

    if not cluster_ids:
        result['verified_per_cluster'] = {}
        result['min_verified'] = 0
        result['passes_threshold'] = False
        continue

    # Centroids come from embeddings_3d, which is row-aligned with labels
    centroids_array = np.array([embeddings_3d[labels == c].mean(axis=0) for c in cluster_ids])

    # Distance from each verified instance to each centroid, then nearest centroid
    distances = cdist(verified_3d, centroids_array, metric='euclidean')
    nearest_idx = distances.argmin(axis=1)

    # Count per cluster, INCLUDING clusters that received no verified
    # instance, so the stored dict agrees with `min_verified` below and with
    # count_verified_per_cluster(), which also reports zero counts.
    counts = np.bincount(nearest_idx, minlength=len(cluster_ids))

    result['verified_per_cluster'] = {c: int(n) for c, n in zip(cluster_ids, counts)}
    result['min_verified'] = int(counts.min())
    result['passes_threshold'] = result['min_verified'] >= MIN_VERIFIED_THRESHOLD

print("Done! Verified instances assigned to clusters.")
print(f"Configs passing threshold: {sum(r['passes_threshold'] for r in all_results)}")

Assigning verified instances to clusters...
Done! Verified instances assigned to clusters.
Configs passing threshold: 27


---
## Section 6: Cluster Evaluation & SWE-Verified Distribution

In [55]:
# Keep the configurations flagged as passing the verified-per-cluster threshold
valid_results = list(filter(lambda res: res['passes_threshold'], all_results))

if len(valid_results) == 0:
    # Nothing qualified: fall back to every configuration so later cells still have input
    print(f"WARNING: No configurations passed the threshold ({len(valid_results)}/{len(all_results)}).")
    print("Falling back to ALL results for comparison to allow analysis to proceed.")
    valid_results = all_results

print(f"Configurations selected: {len(valid_results)} / {len(all_results)}")
print("\nBy algorithm:")

# Per-algorithm tallies of tested vs. selected configurations
tested_per_algo = Counter(res['algorithm'] for res in all_results)
selected_per_algo = Counter(res['algorithm'] for res in valid_results)
for algo in ('HDBSCAN', 'K-Means', 'Agglomerative', 'GMM', 'Spectral'):
    print(f"  {algo}: {selected_per_algo[algo]}/{tested_per_algo[algo]} selected")

Configurations selected: 27 / 171

By algorithm:
  HDBSCAN: 7/80 selected
  K-Means: 5/12 selected
  Agglomerative: 6/36 selected
  GMM: 8/36 selected
  Spectral: 1/7 selected


In [56]:
# One summary row per selected configuration. 'index' records the position in
# valid_results so a row can be traced back after the table is re-sorted.
comparison_data = [
    {
        'index': position,
        'Algorithm': res['algorithm'],
        'Parameters': ', '.join(f"{name}={value}" for name, value in res['params'].items()),
        'N Clusters': res['n_clusters'],
        'Noise %': f"{res['noise_pct']:.1f}%",
        'Silhouette': res['silhouette'],
        'Davies-Bouldin': res['davies_bouldin'],
        'Min Verified': res['min_verified'],
        'Total Verified': sum(res['verified_per_cluster'].values()),
    }
    for position, res in enumerate(valid_results)
]

comparison_df = pd.DataFrame(comparison_data)

if comparison_df.empty:
    print("No valid configurations to display.")
else:
    # Best silhouette first; the positional index is rebuilt to match the new order
    comparison_df = comparison_df.sort_values('Silhouette', ascending=False).reset_index(drop=True)

    rule = "=" * 100
    print(rule)
    print("VALID CLUSTERING CONFIGURATIONS (sorted by Silhouette Score)")
    print(rule)
    print(comparison_df.to_string(index=False))
    print(rule)

VALID CLUSTERING CONFIGURATIONS (sorted by Silhouette Score)
 index     Algorithm                                                          Parameters  N Clusters Noise %  Silhouette  Davies-Bouldin  Min Verified  Total Verified
    22           GMM                                n_components=8, covariance_type=full           8    0.0%    0.544435        0.740415             4             500
     3       HDBSCAN min_cluster_size=750, min_samples=10, cluster_selection_method=leaf           9   19.1%    0.534205        0.728177             4             500
     5       HDBSCAN min_cluster_size=750, min_samples=15, cluster_selection_method=leaf          10   13.0%    0.526936        0.772761             4             500
     9       K-Means                                                        n_clusters=7           7    0.0%    0.523556        0.795365             5             500
    25           GMM                           n_components=8, covariance_type=spherical           8    

In [59]:
# Persist every clustering result. Each entry is a shallow copy whose 'labels'
# array is replaced by a plain list so the json module can write it.
results_for_json = [
    {**res, 'labels': res['labels'].tolist()}
    for res in all_results
]

results_path = ARTIFACTS_DIR / "clustering_results.json"
with results_path.open('w') as out_file:
    # default=float is the fallback encoder for values json cannot serialise natively
    json.dump(results_for_json, out_file, indent=2, default=float)

print(f"Saved all results to {results_path}")

Saved all results to artifacts/clustering_results.json


---
## Section 7: Interactive Cluster Selection

In [62]:
# Display the highest-ranked configurations for selection.
# comparison_df is already sorted by silhouette, so position i printed here is
# the value to use for SELECTED_INDEX in the next cell.
MAX_CONFIGS_SHOWN = 50  # upper bound on the number of rows listed

top_n = min(MAX_CONFIGS_SHOWN, len(comparison_df))

# The header reports the number of rows actually listed (it used to say "TOP 10"
# regardless of how many were printed).
print(f"\nTOP {top_n} CONFIGURATIONS:")
print("=" * 100)

for i in range(top_n):
    row = comparison_df.iloc[i]
    print(f"\n[{i}] {row['Algorithm']} | {row['Parameters']}")
    print(f"    Clusters: {row['N Clusters']} | Silhouette: {row['Silhouette']:.4f} | Min Verified: {row['Min Verified']}")

print("\n" + "=" * 100)


TOP 10 CONFIGURATIONS:

[0] GMM | n_components=8, covariance_type=full
    Clusters: 8 | Silhouette: 0.5444 | Min Verified: 4

[1] HDBSCAN | min_cluster_size=750, min_samples=10, cluster_selection_method=leaf
    Clusters: 9 | Silhouette: 0.5342 | Min Verified: 4

[2] HDBSCAN | min_cluster_size=750, min_samples=15, cluster_selection_method=leaf
    Clusters: 10 | Silhouette: 0.5269 | Min Verified: 4

[3] K-Means | n_clusters=7
    Clusters: 7 | Silhouette: 0.5236 | Min Verified: 5

[4] GMM | n_components=8, covariance_type=spherical
    Clusters: 8 | Silhouette: 0.5194 | Min Verified: 4

[5] K-Means | n_clusters=5
    Clusters: 5 | Silhouette: 0.5185 | Min Verified: 15

[6] K-Means | n_clusters=8
    Clusters: 8 | Silhouette: 0.5174 | Min Verified: 3

[7] Agglomerative | n_clusters=8, linkage=ward
    Clusters: 8 | Silhouette: 0.5161 | Min Verified: 4

[8] Agglomerative | n_clusters=5, linkage=average
    Clusters: 5 | Silhouette: 0.5153 | Min Verified: 15

[9] GMM | n_components=5, c

In [63]:

# Manual choice of the clustering configuration to export.
# SELECTED_INDEX is a row position in the silhouette-sorted comparison_df
# (the bracketed number printed by the listing cell).
SELECTED_INDEX = 3  # <-- CHANGE THIS TO SELECT DIFFERENT CONFIG

# The 'index' column maps the sorted row back to its position in valid_results
chosen_row = comparison_df.iloc[SELECTED_INDEX]
original_idx = chosen_row['index']
selected_result = valid_results[original_idx]

summary_lines = [
    "\nSELECTED CONFIGURATION:",
    f"  Algorithm: {selected_result['algorithm']}",
    f"  Parameters: {selected_result['params']}",
    f"  N Clusters: {selected_result['n_clusters']}",
    f"  Silhouette: {selected_result['silhouette']:.4f}",
    f"  Noise %: {selected_result['noise_pct']:.1f}%",
    "\nVerified per cluster:",
]
print("\n".join(summary_lines))

# Clusters in ascending id order
for cluster_id in sorted(selected_result['verified_per_cluster']):
    count = selected_result['verified_per_cluster'][cluster_id]
    print(f"  Cluster {cluster_id}: {count} verified")


SELECTED CONFIGURATION:
  Algorithm: K-Means
  Parameters: {'n_clusters': 7}
  N Clusters: 7
  Silhouette: 0.5236
  Noise %: 0.0%

Verified per cluster:
  Cluster 0: 109 verified
  Cluster 1: 44 verified
  Cluster 2: 194 verified
  Cluster 3: 15 verified
  Cluster 4: 15 verified
  Cluster 5: 5 verified
  Cluster 6: 118 verified


In [64]:
# Interactive 3D scatter of the training set coloured by the selected clustering
selected_labels = selected_result['labels']

# Attach the labels to the main frame; negative labels are displayed as "Noise"
df['cluster'] = selected_labels
df['cluster_label'] = [
    f"Cluster {x}" if x >= 0 else "Noise"
    for x in df['cluster']
]

print("Creating 3D visualization of selected clustering...")

axis_names = {'umap_3d_x': 'UMAP 1', 'umap_3d_y': 'UMAP 2', 'umap_3d_z': 'UMAP 3'}
plot_title = f"{selected_result['algorithm']} Clustering ({selected_result['n_clusters']} clusters)"

fig = px.scatter_3d(
    df,
    x='umap_3d_x',
    y='umap_3d_y',
    z='umap_3d_z',
    color='cluster_label',
    hover_data=['instance_id', 'repo', 'is_verified'],
    title=plot_title,
    labels=axis_names,
    height=800,
    opacity=0.7,
)

# Small markers keep the dense point cloud readable
fig.update_traces(marker={'size': 3})
fig.update_layout(
    scene={
        'xaxis_title': 'UMAP 1',
        'yaxis_title': 'UMAP 2',
        'zaxis_title': 'UMAP 3',
    }
)

# Write a standalone HTML copy, then render inline
viz_clusters_path = ARTIFACTS_DIR / "viz_selected_clusters_3d.html"
fig.write_html(viz_clusters_path)
print(f"Saved to {viz_clusters_path}")

fig.show()

Creating 3D visualization of selected clustering...
Saved to artifacts/viz_selected_clusters_3d.html


In [65]:
# Training-set size of each non-noise cluster, alongside its verified count
cluster_sizes = (
    df.loc[df['cluster'] >= 0, 'cluster']
    .value_counts()
    .sort_index()
)
verified_lookup = selected_result['verified_per_cluster']

print("\nCluster Size Distribution:")
print("=" * 60)
for cluster_id, size in cluster_sizes.items():
    # Clusters with no verified instance are absent from the mapping, hence the default
    verified = verified_lookup.get(cluster_id, 0)
    print(f"  Cluster {cluster_id}: {size:,} instances ({verified} verified)")

noise_total = (df['cluster'] == -1).sum()
print(f"\n  Noise: {noise_total:,} instances")


Cluster Size Distribution:
  Cluster 0: 3,571 instances (109 verified)
  Cluster 1: 2,215 instances (44 verified)
  Cluster 2: 5,167 instances (194 verified)
  Cluster 3: 1,840 instances (15 verified)
  Cluster 4: 1,295 instances (15 verified)
  Cluster 5: 1,936 instances (5 verified)
  Cluster 6: 2,984 instances (118 verified)

  Noise: 0 instances


---
## Section 8: Centroid Computation & Export

In [66]:
# Centroids (768-D and UMAP 3D) plus summary statistics for the selected clustering
print("Computing cluster centroids...")

cluster_centers_768d = {}
cluster_centers_umap3d = {}
cluster_stats = {}

# Label -1 marks noise and gets no centroid
real_cluster_ids = [cid for cid in sorted(np.unique(selected_labels)) if cid != -1]

for cluster_id in real_cluster_ids:
    key = int(cluster_id)  # plain int keys for the exported dictionaries
    mask = selected_labels == cluster_id
    members = df[mask]

    # Mean of the member embeddings in the original space
    cluster_centers_768d[key] = embeddings[mask].mean(axis=0).tolist()

    # Mean of the member coordinates in UMAP 3D space
    points_3d = embeddings_3d[mask]
    center_3d = points_3d.mean(axis=0)
    cluster_centers_umap3d[key] = center_3d.tolist()

    # Spread: Euclidean distance of every member to the 3D centroid
    radii = np.linalg.norm(points_3d - center_3d, axis=1)

    # NOTE(review): verified_count is taken from df['is_verified']; the final
    # summary output reports 0 verified instances, so this may be 0 for every
    # cluster while selected_result['verified_per_cluster'] is not - confirm.
    cluster_stats[key] = {
        'size': int(mask.sum()),
        'verified_count': int(members['is_verified'].sum()),
        'avg_distance_to_center': float(radii.mean()),
        'std_distance_to_center': float(radii.std()),
        'repos': members['repo'].value_counts().to_dict(),
    }

print(f"Computed centroids for {len(cluster_centers_768d)} clusters")

Computing cluster centroids...
Computed centroids for 7 clusters


In [67]:
# Assemble the export document piece by piece, then write it as JSON.

# Provenance of the run: data source, embedding model, UMAP settings, threshold
export_metadata = {
    'created_at': datetime.now().isoformat(),
    'dataset': 'princeton-nlp/SWE-bench',
    'dataset_size': len(df),
    'verified_count': int(verified_count),
    'embedding_model': 'nomic-ai/CodeRankEmbed',
    'embedding_dim': 768,
    'umap_params': {
        'n_components': 3,
        'n_neighbors': 15,
        'min_dist': 0.1,
        'metric': 'cosine',
    },
    'min_verified_threshold': MIN_VERIFIED_THRESHOLD,
}

# Quality metrics of the selected configuration; scores are cast to builtin float
export_metrics = {'n_clusters': selected_result['n_clusters']}
for metric_name in ('silhouette', 'davies_bouldin', 'calinski_harabasz', 'noise_pct'):
    export_metrics[metric_name] = float(selected_result[metric_name])

export_data = {
    'metadata': export_metadata,
    'algorithm': selected_result['algorithm'],
    'params': selected_result['params'],
    'metrics': export_metrics,
    'cluster_centers_768d': cluster_centers_768d,
    'cluster_centers_umap3d': cluster_centers_umap3d,
    'cluster_stats': cluster_stats,
    'verified_per_cluster': selected_result['verified_per_cluster'],
}

centroids_path = ARTIFACTS_DIR / "selected_cluster_centroids.json"
with centroids_path.open('w') as out_file:
    json.dump(export_data, out_file, indent=2)

print(f"Saved cluster centroids to {centroids_path}")

Saved cluster centroids to artifacts/selected_cluster_centroids.json


In [68]:
# Export the per-instance assignment as three parallel lists
# (instance id, selected cluster label, verified flag).
cluster_assignments = dict(
    instance_ids=df['instance_id'].tolist(),
    cluster_labels=selected_labels.tolist(),
    is_verified=df['is_verified'].tolist(),
)

assignments_path = ARTIFACTS_DIR / "cluster_assignments.json"
with assignments_path.open('w') as out_file:
    json.dump(cluster_assignments, out_file, indent=2)

print(f"Saved cluster assignments to {assignments_path}")

Saved cluster assignments to artifacts/cluster_assignments.json


In [69]:
# Final summary: dataset figures, the selected clustering, and the files written
# to the artifacts directory.
print("\n" + "=" * 80)
print("CLUSTERING COMPLETE - SUMMARY")
print("=" * 80)

print(f"\nDataset:")
print(f"  Total instances: {len(df):,}")
print(f"  Verified instances: {verified_count:,}")
print(f"  Repositories: {df['repo'].nunique()}")

print(f"\nSelected Clustering:")
print(f"  Algorithm: {selected_result['algorithm']}")
print(f"  Parameters: {selected_result['params']}")
print(f"  Clusters: {selected_result['n_clusters']}")
print(f"  Silhouette: {selected_result['silhouette']:.4f}")
print(f"  Min verified per cluster: {selected_result['min_verified']}")

print(f"\nArtifacts saved to {ARTIFACTS_DIR}:")
for artifact in sorted(ARTIFACTS_DIR.glob('*')):
    # glob('*') also yields directories (e.g. .ipynb_checkpoints); they are not
    # artifacts and their st_size is not a content size, so list regular files only.
    if not artifact.is_file():
        continue
    size_kb = artifact.stat().st_size / 1024
    # Switch to MB once the file exceeds 1024 KB
    if size_kb > 1024:
        size_str = f"{size_kb/1024:.1f} MB"
    else:
        size_str = f"{size_kb:.1f} KB"
    print(f"  - {artifact.name} ({size_str})")

print("\n" + "=" * 80)
print("DONE")
print("=" * 80)


CLUSTERING COMPLETE - SUMMARY

Dataset:
  Total instances: 19,008
  Verified instances: 0
  Repositories: 35

Selected Clustering:
  Algorithm: K-Means
  Parameters: {'n_clusters': 7}
  Clusters: 7
  Silhouette: 0.5236
  Min verified per cluster: 5

Artifacts saved to artifacts:
  - .ipynb_checkpoints (4.0 KB)
  - cluster_assignments.json (904.7 KB)
  - clustering_results.json (29.0 MB)
  - embeddings_768d.npy (27.8 MB)
  - selected_cluster_centroids.json (148.7 KB)
  - umap_2d.npy (148.6 KB)
  - umap_3d.npy (222.9 KB)
  - viz_by_repository_3d.html (5.5 MB)
  - viz_by_verified_3d.html (5.8 MB)
  - viz_selected_clusters_3d.html (5.9 MB)

DONE


In [70]:
# Cell: Map Verified Instances to Clusters
#
# Reads the exported 768-D centroids back from disk, embeds the SWE-bench
# Verified instances, and assigns each one to the nearest centroid by cosine
# distance. The cell carries its own imports so it can run from the saved
# artifacts alone.
#
# The verified texts/embeddings are stored under cell-specific names: binding
# them to `embeddings` would replace the training embeddings that the UMAP
# fitting and centroid cells read, so re-running those cells afterwards would
# silently operate on the 500 verified vectors instead.

import json
import numpy as np
import torch
from pathlib import Path
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cdist

# Define artifacts directory
ARTIFACTS_DIR = Path("artifacts")

# 1. Load centroids
centroids_path = ARTIFACTS_DIR / "selected_cluster_centroids.json"
if not centroids_path.exists():
    raise FileNotFoundError(f"Centroids file not found at {centroids_path}")

with open(centroids_path) as f:
    data = json.load(f)

# JSON object keys are strings ("0", "1", ...); sort numerically so that row j of
# `centroids` corresponds to cluster_ids[j]
cluster_ids = sorted(int(k) for k in data["cluster_centers_768d"])
centroids = np.array([data["cluster_centers_768d"][str(i)] for i in cluster_ids])
print(f"Loaded {len(centroids)} centroids from {centroids_path}")

# 2. Load verified instances
verified = load_dataset("princeton-nlp/SWE-bench_Verified", split="test")
print(f"Loaded {len(verified)} verified instances")

# 3. Embed with CodeRankEmbed
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

model = SentenceTransformer("nomic-ai/CodeRankEmbed", trust_remote_code=True, device=device)
if device == "cuda":
    model = model.half()

def prepare_text(inst):
    """Build the embedding input for one instance.

    Combines the stripped problem statement with the hints text (when it is a
    non-empty string) and prepends the query instruction prefix.

    Args:
        inst: Mapping-like record providing 'problem_statement' and,
            optionally, 'hints_text'.

    Returns:
        The prefixed text to pass to the embedding model.
    """
    problem_stmt = inst['problem_statement'].strip()
    hints = inst.get('hints_text', '')
    if hints and isinstance(hints, str) and len(hints) > 0:
        combined = f"{problem_stmt}\n\nHints:\n{hints}"
    else:
        combined = problem_stmt
    return f"Represent this query for searching relevant code: {combined}"

verified_texts = [prepare_text(inst) for inst in verified]

# Batch size 1 to avoid OOM on large texts
verified_embeddings_768d = model.encode(
    verified_texts,
    show_progress_bar=True,
    batch_size=1,
    normalize_embeddings=True,
    convert_to_numpy=True
)

# 4. Assign to nearest centroid (cosine distance)
distances = cdist(verified_embeddings_768d, centroids, metric='cosine')
nearest_indices = distances.argmin(axis=1)
cluster_labels = [cluster_ids[i] for i in nearest_indices]

# 5. Save verified assignments
verified_assignments = {
    "instance_ids": [inst["instance_id"] for inst in verified],
    "cluster_labels": cluster_labels
}

output_path = ARTIFACTS_DIR / "verified_cluster_assignments.json"
with open(output_path, "w") as f:
    json.dump(verified_assignments, f, indent=2)

print(f"Saved {len(verified_assignments['instance_ids'])} verified assignments to {output_path}")

unique, counts = np.unique(cluster_labels, return_counts=True)
print(f"Cluster distribution: {dict(zip(unique.tolist(), counts.tolist()))}")

Loaded 7 centroids from artifacts/selected_cluster_centroids.json
Loaded 500 verified instances
Using device: cuda




Batches:   0%|          | 0/500 [00:00<?, ?it/s]

Saved 500 verified assignments to artifacts/verified_cluster_assignments.json
Cluster distribution: {0: 125, 1: 46, 2: 137, 3: 38, 4: 49, 5: 10, 6: 95}
