# 🎯 Hierarchical Clustering Optimization

**Based on findings from Notebook 04:**
- Best Model: **CodeBERT**
- Best Preprocessing: **Pure embeddings** (no PCA)
- Best Algorithm: **Agglomerative Single Linkage**
- Best K: **2**
- Baseline Silhouette: **0.5952**

## 🔬 Goal: Deep Dive into Hierarchical Clustering

Since Agglomerative clustering performed best, let's optimize:

### 1. Linkage Methods (Comprehensive)
- Single (current best: 0.5952)
- Complete
- Average
- Ward
- Weighted
- Centroid
- Median

### 2. Distance Metrics
- Cosine (current)
- Euclidean
- Manhattan
- Correlation

### 3. K Values (Focused Range)
- Fine-grained around K=2: [2, 3, 4, 5, 6, 8, 10, 12, 15]

### 4. Dendrogram Analysis
- Find natural cluster cutoffs
- Visualize hierarchical structure

### 5. Multi-Level Hierarchical Clustering
- Level 1: Coarse clusters
- Level 2: Fine-grained sub-clusters

---
**⚡ GPU recommended for embeddings**

## 0. 🔧 Setup

In [None]:
# Install packages with version compatibility for Colab T4 GPU
# Colab now uses NumPy 2.x as default for many packages
# NOTE: lines starting with `!` are IPython/Colab shell escapes, so this cell
# only runs inside a notebook kernel, not as a plain Python script.
!pip install -q --upgrade pip

# Use NumPy 2.x to avoid conflicts with Colab pre-installed packages
# (installed before the packages below so they resolve against NumPy 2.x)
!pip install -q 'numpy>=2.0.0'

# Core ML packages - latest versions compatible with NumPy 2.x
!pip install -q 'transformers>=4.40.0'
!pip install -q 'datasets>=2.18.0'
!pip install -q 'scikit-learn>=1.4.0'
!pip install -q 'scipy>=1.12.0'

# Visualization packages
!pip install -q 'matplotlib>=3.8.0'
!pip install -q 'seaborn>=0.13.0'
!pip install -q 'pandas>=2.2.0'
!pip install -q 'umap-learn>=0.5.5'

# Verify installations
# Importing here doubles as a smoke test: a broken install fails in this cell
# rather than midway through the experiments. torch is not installed above;
# presumably it ships with the Colab runtime -- confirm when running elsewhere.
import torch
import numpy as np
import transformers
import sklearn

# Report the resolved versions so a run can be reproduced later.
print(f'‚úÖ Packages installed!')
print(f'NumPy version: {np.__version__}')
print(f'PyTorch version: {torch.__version__}')
print(f'Transformers version: {transformers.__version__}')
print(f'scikit-learn version: {sklearn.__version__}')
print(f'CUDA available: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'CUDA version: {torch.version.cuda}')

In [None]:
import torch

# Choose the compute device once; later cells read the module-level `device`.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Device: {device}')

if device != 'cuda':
    # No GPU visible: tell the user how to enable one in Colab.
    print('üíª CPU mode')
    print('   üí° Enable GPU: Runtime ‚Üí Change runtime type ‚Üí T4 GPU')
else:
    # Describe GPU 0: name, total memory and compute capability.
    gpu_name = torch.cuda.get_device_name(0)
    print(f'GPU: {gpu_name}')
    props = torch.cuda.get_device_properties(0)
    total_gb = props.total_memory / 1e9
    print(f'Memory: {total_gb:.1f} GB')
    print(f'Compute Capability: {props.major}.{props.minor}')

    # T4-specific hint (matched on the device name).
    if 'T4' in gpu_name:
        print('‚úÖ T4 GPU detected - optimized for mixed precision training')
        print('   Recommendation: Use batch_size=32-64 for best performance')

    # Release cached allocator blocks, then report the first value returned
    # by mem_get_info() as the available memory.
    torch.cuda.empty_cache()
    free_gb = torch.cuda.mem_get_info()[0] / 1e9
    print(f'Available memory: {free_gb:.1f} GB')

## 1. 📦 Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
import json
import time
# Silences every warning category for the rest of the session; this keeps cell
# output short but also hides deprecation notices raised by the libraries below.
warnings.filterwarnings('ignore')

# Hierarchical clustering
from sklearn.cluster import AgglomerativeClustering
# NOTE: `linkage` is scipy's linkage-matrix function. Avoid reusing the bare
# name `linkage` as a variable in later cells, or calls to it will break.
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial.distance import pdist, squareform

# Metrics
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.preprocessing import normalize

# Embeddings
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset
import torch

# Viz
import umap
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# IPython magics: render figures inline, at retina resolution.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Seed NumPy's global RNG so NumPy-based randomness is repeatable across runs.
np.random.seed(42)

print('‚úÖ Imports complete!')

## 2. 📥 Load Coding Datasets

In [None]:
# Ordered (keywords, label) rules; the first rule with a matching substring wins.
# NOTE(review): "sklearn" will not match a repo named "scikit-learn/..." --
# confirm against the repo names actually present in SWE-bench.
_SWE_DOMAIN_RULES = (
    (("django", "flask"), "web_framework"),
    (("sklearn", "pandas", "numpy"), "data_science"),
    (("matplotlib", "seaborn"), "visualization"),
    (("pytest", "test"), "testing"),
    (("requests", "http"), "networking"),
)

_SWE_TASK_RULES = (
    (("bug", "fix", "error"), "bug_fix"),
    (("test",), "testing"),
    (("refactor", "clean"), "refactor"),
    (("add", "implement", "feature"), "feature"),
)

_DS1000_DOMAINS = {
    "Numpy": "data_manipulation",
    "Pandas": "data_manipulation",
    "Scipy": "data_manipulation",
    "Matplotlib": "visualization",
    "Pytorch": "machine_learning",
    "Tensorflow": "machine_learning",
    "Sklearn": "machine_learning",
}

_DEBUGBENCH_COMPLEXITY = {"easy": "simple", "medium": "medium", "hard": "complex"}


def _first_keyword_label(text, rules, default):
    """Return the label of the first rule with a keyword contained in *text*."""
    for keywords, label in rules:
        if any(keyword in text for keyword in keywords):
            return label
    return default


def _length_bucket(text, simple_below, medium_below):
    """Bucket *text* as simple/medium/complex by its character count."""
    size = len(text)
    if size < simple_below:
        return "simple"
    if size < medium_below:
        return "medium"
    return "complex"


def _budget_spent(questions, max_total):
    """Return True once *questions* has reached the *max_total* cap (None = no cap)."""
    return max_total is not None and len(questions) >= max_total


def load_coding_datasets(max_total=4000):
    """
    Load diverse coding datasets.
    Same as notebook 03/04.

    Four sources are read in order: SWE-bench Lite (at most 2000 items),
    DS-1000 (all items), BigCodeBench (at most 500) and DebugBench (at most
    500). Each source is loaded best-effort: if loading or iterating it
    raises, the error is printed and the remaining sources are still tried.

    Args:
        max_total: Upper bound on the number of records returned across all
            sources. Sources are filled in order, so once the cap is reached
            the later sources contribute nothing. ``None`` disables the cap.

    Returns:
        A list of dicts with the keys ``question``, ``language``, ``domain``,
        ``task_type``, ``complexity`` and ``source``. The list is empty when
        no source could be loaded.
    """
    questions = []

    print("="*70)
    print("LOADING CODING DATASETS")
    print("="*70)

    # 1. SWE-bench
    print("\n1. Loading SWE-bench (GitHub issues)...")
    try:
        swe_dataset = load_dataset("princeton-nlp/SWE-bench_Lite", split="test")
        count = 0
        target = min(2000, len(swe_dataset))

        for item in swe_dataset:
            if count >= target or _budget_spent(questions, max_total):
                break

            problem = item.get("problem_statement", "")
            repo = item.get("repo", "")
            if not problem:
                continue

            questions.append({
                "question": problem,
                "language": "python",
                "domain": _first_keyword_label(repo.lower(), _SWE_DOMAIN_RULES, "general"),
                "task_type": _first_keyword_label(problem.lower(), _SWE_TASK_RULES, "general"),
                "complexity": _length_bucket(problem, 200, 500),
                "source": f"swe_bench_{repo}"
            })
            count += 1

        print(f"   ‚úì Loaded {count} GitHub issues")

    except Exception as e:
        print(f"   ‚úó Error: {e}")

    # 2. DS-1000
    print("\n2. Loading DS-1000 (Data science tasks)...")
    try:
        ds_dataset = load_dataset("xlangai/DS-1000", split="test")
        count = 0

        for item in ds_dataset:
            if _budget_spent(questions, max_total):
                break

            prompt = item.get("prompt", "")
            metadata = item.get("metadata", {})
            library = metadata.get("library", "unknown") if isinstance(metadata, dict) else "unknown"
            if not prompt:
                continue

            questions.append({
                "question": prompt,
                "language": "python",
                "domain": _DS1000_DOMAINS.get(library, "data_science"),
                "task_type": "code_generation",
                "complexity": _length_bucket(prompt, 150, 300),
                "source": f"ds1000_{library.lower()}"
            })
            count += 1

        print(f"   ‚úì Loaded {count} data science tasks")

    except Exception as e:
        print(f"   ‚úó Error: {e}")

    # 3. BigCodeBench
    print("\n3. Loading BigCodeBench (API tasks)...")
    try:
        bigcode_dataset = load_dataset("bigcode/bigcodebench", split="v0.1.2")
        count = 0
        target = min(500, len(bigcode_dataset))

        for item in bigcode_dataset:
            if count >= target or _budget_spent(questions, max_total):
                break

            complete_prompt = item.get("complete_prompt", "")
            instruct_prompt = item.get("instruct_prompt", "")
            # Prefer the instruction-style prompt; fall back to the completion prompt.
            prompt = instruct_prompt if instruct_prompt else complete_prompt
            if not prompt:
                continue

            questions.append({
                "question": prompt,
                "language": "python",
                "domain": "api_usage",
                "task_type": "code_generation",
                "complexity": _length_bucket(prompt, 200, 400),
                "source": "bigcodebench"
            })
            count += 1

        print(f"   ‚úì Loaded {count} API tasks")

    except Exception as e:
        print(f"   ‚úó Error: {e}")

    # 4. DebugBench
    print("\n4. Loading DebugBench (Debugging tasks)...")
    try:
        debug_dataset = load_dataset("Rtian/DebugBench", split="test")
        count = 0
        target = min(500, len(debug_dataset))

        for item in debug_dataset:
            if count >= target or _budget_spent(questions, max_total):
                break

            buggy_code = item.get("buggy_code", "")
            language = item.get("language", "python").lower()
            difficulty = item.get("difficulty", "medium").lower()
            if not buggy_code:
                continue

            questions.append({
                "question": f"Debug this code:\n{buggy_code}",
                "language": language,
                "domain": "algorithms",
                # Unknown difficulty labels fall back to "medium".
                "complexity": _DEBUGBENCH_COMPLEXITY.get(difficulty, "medium"),
                "task_type": "debugging",
                "source": "debugbench"
            })
            count += 1

        print(f"   ‚úì Loaded {count} debugging tasks")

    except Exception as e:
        print(f"   ‚úó Error: {e}")

    print(f"\n{'='*70}")
    print(f"‚úÖ Total: {len(questions)} coding tasks")
    print(f"\nBreakdown:")
    print(f"  Languages: {Counter(q['language'] for q in questions)}")
    print(f"  Domains: {Counter(q['domain'] for q in questions)}")
    print(f"  Task Types: {Counter(q['task_type'] for q in questions)}")
    print(f"  Complexity: {Counter(q['complexity'] for q in questions)}")
    print(f"{'='*70}")

    return questions

# Build the corpus: the full task records, plus the raw question strings
# that the embedding step consumes.
questions = load_coding_datasets(max_total=4000)
texts = [record['question'] for record in questions]

## 3. 🧠 Extract CodeBERT Embeddings (Winner from Notebook 04)

In [None]:
def mean_pooling(token_embeddings, attention_mask):
    """Average token embeddings along dim 1, counting only unmasked positions.

    The mask is broadcast to the embedding shape and used as a 0/1 weight.
    The divisor is clamped to 1e-9, so a fully masked row yields zeros
    instead of dividing by zero.
    """
    weights = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    weighted_sum = (token_embeddings * weights).sum(dim=1)
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_counts

def encode_with_codebert(model, tokenizer, texts, device, batch_size=32):
    """Embed *texts* in batches and return the mean-pooled vectors.

    Args:
        model: Encoder that is called with the tokenizer output and exposes
            ``last_hidden_state`` on its result.
        tokenizer: Callable producing padded, truncated (max 512 tokens)
            PyTorch tensors with an ``attention_mask`` entry.
        texts: Sliceable sequence of strings to embed.
        device: Device the tokenized batch is moved to before the forward pass.
        batch_size: Number of texts per forward pass.

    Returns:
        A NumPy array built by vertically stacking the per-batch pooled
        embeddings (presumably one row per text -- confirm against the
        tokenizer/model in use).
    """
    model.eval()
    pooled_batches = []
    total = len(texts)

    print(f'üöÄ Encoding {len(texts)} texts with CodeBERT...')

    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        for batch_no, start in enumerate(range(0, total, batch_size)):
            stop = start + batch_size

            inputs = tokenizer(
                texts[start:stop],
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors='pt'
            ).to(device)

            hidden_states = model(**inputs).last_hidden_state
            pooled = mean_pooling(hidden_states, inputs['attention_mask'])
            pooled_batches.append(pooled.cpu().numpy())

            # Refresh the in-place progress line every 10th batch.
            if batch_no % 10 == 0:
                print(f'  Processed {min(stop, len(texts))}/{len(texts)}', end='\r')

    print(f'  Processed {len(texts)}/{len(texts)} ‚úì')
    return np.vstack(pooled_batches)

banner = "="*70
print(banner)
print("LOADING CODEBERT (Best Model from Notebook 04)")
print(banner)

# Tokenizer and encoder come from the same checkpoint; the encoder is moved to `device`.
checkpoint = 'microsoft/codebert-base'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint).to(device)

# Embed every question, then L2-normalise each row.
codebert_embeddings = encode_with_codebert(model, tokenizer, texts, device, batch_size=32)
codebert_norm = normalize(codebert_embeddings, norm='l2')

print(f"\n‚úÖ CodeBERT embeddings: {codebert_norm.shape}")
print(f"   Mean: {codebert_norm.mean():.4f}, Std: {codebert_norm.std():.4f}")

# The encoder is no longer needed once the embeddings exist; drop the
# references and, on GPU, release the cached allocator blocks.
del model, tokenizer
if device == 'cuda':
    torch.cuda.empty_cache()

print(banner)

## 4. 🔬 Comprehensive Hierarchical Clustering Experiments

### Experiment 1: All Linkage Methods with Sklearn

In [None]:
# Experiment 1: grid over sklearn linkage methods x K, scored by cosine silhouette.
print("="*70)
print("EXPERIMENT 1: LINKAGE METHOD COMPARISON (Sklearn)")
print("="*70)
print("Testing: single, complete, average, ward")
print("K values: 2, 3, 4, 5, 6, 8, 10, 12, 15\n")

linkages = ['single', 'complete', 'average', 'ward']
k_values = [2, 3, 4, 5, 6, 8, 10, 12, 15]

linkage_results = []

# The loop variable must NOT be called `linkage`: that name is bound to
# scipy.cluster.hierarchy.linkage by the imports cell, and rebinding it to a
# string would break every later `linkage(...)` call in the notebook.
for linkage_name in linkages:
    print(f"\nTesting linkage: {linkage_name}")

    # Ward is fitted with euclidean distances; the other linkages use cosine.
    fit_metric = 'euclidean' if linkage_name == 'ward' else 'cosine'

    for k in k_values:
        try:
            start_time = time.time()

            agg = AgglomerativeClustering(
                n_clusters=k,
                linkage=linkage_name,
                metric=fit_metric
            )

            labels = agg.fit_predict(codebert_norm)

            # Calculate silhouette (always cosine, so rows are comparable)
            sil = silhouette_score(codebert_norm, labels, metric='cosine')

            # Note: the timing covers both the fit and the silhouette computation.
            elapsed = time.time() - start_time

            linkage_results.append({
                'linkage': linkage_name,
                'k': k,
                'silhouette': sil,
                'time_sec': elapsed
            })

            print(f"  K={k:2d}: Silhouette={sil:.6f} ({elapsed:.2f}s)")

        except Exception as e:
            # Best-effort grid: report the failing configuration and keep going.
            print(f"  K={k:2d}: Failed - {str(e)[:50]}")

linkage_df = pd.DataFrame(linkage_results)

# Later cells read `best_linkage`; fail here with a clear message instead of
# an opaque KeyError on the missing 'silhouette' column.
if linkage_df.empty:
    raise RuntimeError("Experiment 1 produced no results; every configuration failed")

print(f"\n{'='*70}")
print("TOP 10 RESULTS")
print(f"{'='*70}")
print(linkage_df.nlargest(10, 'silhouette')[['linkage', 'k', 'silhouette', 'time_sec']])

# Row (pandas Series) of the best-scoring configuration.
best_linkage = linkage_df.loc[linkage_df['silhouette'].idxmax()]
print(f"\n‚úÖ Best: {best_linkage['linkage']} linkage, K={int(best_linkage['k'])}, Silhouette={best_linkage['silhouette']:.6f}")
print(f"   Baseline (from Notebook 04): single linkage, K=2, Silhouette=0.5952")
print(f"   Improvement: {(best_linkage['silhouette'] - 0.5952) / 0.5952 * 100:+.2f}%")

### Experiment 2: Scipy Linkage Methods (More Options)

In [None]:
# Experiment 2: scipy linkage matrices cut into K flat clusters, scored by
# cosine silhouette.
# scipy's linkage() is imported under a private alias so this cell keeps
# working even if the bare name `linkage` has been rebound (for example by a
# loop variable in another notebook cell).
from scipy.cluster.hierarchy import linkage as scipy_linkage

print("="*70)
print("EXPERIMENT 2: SCIPY LINKAGE METHODS (Extended)")
print("="*70)
print("Testing: single, complete, average, weighted, centroid, median, ward")
print("Using scipy.cluster.hierarchy for more linkage options\n")

# Scipy linkage methods
scipy_linkages = ['single', 'complete', 'average', 'weighted', 'centroid', 'median', 'ward']
k_values_focused = [2, 3, 4, 5, 6, 8, 10]

# scipy only defines these methods for Euclidean distances and rejects any
# other metric when given raw observations.
euclidean_only_methods = {'ward', 'centroid', 'median'}

scipy_results = []

for link_method in scipy_linkages:
    print(f"\nTesting scipy linkage: {link_method}")

    link_metric = 'euclidean' if link_method in euclidean_only_methods else 'cosine'

    try:
        # Compute linkage matrix once per method; it is reused for every K.
        Z = scipy_linkage(codebert_norm, method=link_method, metric=link_metric)
    except Exception as e:
        print(f"  Failed: {str(e)[:50]}")
        continue

    # Test different K values
    for k in k_values_focused:
        try:
            # 'maxclust' yields at most k flat clusters (possibly fewer).
            labels = fcluster(Z, k, criterion='maxclust')

            # Calculate silhouette
            sil = silhouette_score(codebert_norm, labels, metric='cosine')
        except Exception as e:
            # One bad K must not discard the remaining K values of this method.
            print(f"  K={k:2d}: Failed - {str(e)[:50]}")
            continue

        scipy_results.append({
            'linkage': link_method,
            'k': k,
            'silhouette': sil,
            'method': 'scipy'
        })

        print(f"  K={k:2d}: Silhouette={sil:.6f}")

scipy_df = pd.DataFrame(scipy_results)

print(f"\n{'='*70}")
print("TOP 10 SCIPY RESULTS")
print(f"{'='*70}")
if len(scipy_df) > 0:
    print(scipy_df.nlargest(10, 'silhouette')[['linkage', 'k', 'silhouette']])

    best_scipy = scipy_df.loc[scipy_df['silhouette'].idxmax()]
    print(f"\n‚úÖ Best scipy: {best_scipy['linkage']} linkage, K={int(best_scipy['k'])}, Silhouette={best_scipy['silhouette']:.6f}")

### Experiment 3: Distance Metrics (Single Linkage Only)

In [None]:
# Experiment 3: single linkage fitted under several distance metrics.
print("="*70)
print("EXPERIMENT 3: DISTANCE METRICS (Single Linkage Only)")
print("="*70)
print("Testing metrics: cosine, euclidean, manhattan, correlation")
print("Using best linkage: single, K values: 2, 3, 4, 5\n")

metrics = ['cosine', 'euclidean', 'manhattan', 'correlation']
k_values_metrics = [2, 3, 4, 5]

metric_results = []

for dist_metric in metrics:
    print(f"\nTesting metric: {dist_metric}")

    for k in k_values_metrics:
        try:
            estimator_kwargs = {
                'n_clusters': k,
                'linkage': 'single',
                'metric': dist_metric,
            }
            agg = AgglomerativeClustering(**estimator_kwargs)
            labels = agg.fit_predict(codebert_norm)

            # Score with cosine whatever the fitting metric was, so that the
            # rows stay comparable with each other.
            sil = silhouette_score(codebert_norm, labels, metric='cosine')
        except Exception as e:
            print(f"  K={k}: Failed - {str(e)[:50]}")
        else:
            metric_results.append({
                'metric': dist_metric,
                'k': k,
                'silhouette': sil
            })
            print(f"  K={k}: Silhouette={sil:.6f}")

metric_df = pd.DataFrame(metric_results)

print(f"\n{'='*70}")
print("METRIC COMPARISON")
print(f"{'='*70}")
if len(metric_df) > 0:
    print(metric_df.nlargest(10, 'silhouette'))

    top_row_label = metric_df['silhouette'].idxmax()
    best_metric = metric_df.loc[top_row_label]
    print(f"\n‚úÖ Best metric: {best_metric['metric']}, K={int(best_metric['k'])}, Silhouette={best_metric['silhouette']:.6f}")
## 5. 📊 Dendrogram Analysis

In [None]:
# Dendrogram of the best sklearn linkage configuration found in Experiment 1.
# scipy's linkage() is imported under a private alias so this cell keeps
# working even if the bare name `linkage` has been rebound (for example by a
# loop variable in another notebook cell).
from scipy.cluster.hierarchy import linkage as scipy_linkage

print("="*70)
print("DENDROGRAM ANALYSIS")
print("="*70)
print("Visualizing hierarchical structure with best linkage method\n")

# Use best linkage from experiments
best_overall_linkage = best_linkage['linkage']
print(f"Using linkage: {best_overall_linkage}")

# Compute linkage matrix (ward needs euclidean; the other linkages use cosine)
tree_metric = 'euclidean' if best_overall_linkage == 'ward' else 'cosine'
Z = scipy_linkage(codebert_norm, method=best_overall_linkage, metric=tree_metric)

# Place the cutoff line where the tree splits into the best K clusters:
# halfway between the merge that would reduce K clusters to K-1 and the
# merge just below it. Column 2 of Z holds the merge heights. Falls back to
# the previous fixed 0.5 when that position cannot be computed.
cut_k = int(best_linkage['k'])
if 2 <= cut_k <= len(Z):
    cutoff_height = (Z[-cut_k, 2] + Z[-(cut_k - 1), 2]) / 2
else:
    cutoff_height = 0.5

# Plot dendrogram (truncated for readability)
fig, ax = plt.subplots(figsize=(16, 8))

dendrogram(
    Z,
    truncate_mode='lastp',
    p=30,  # Show last 30 merges
    leaf_font_size=10,
    ax=ax
)

ax.set_title(f'Hierarchical Clustering Dendrogram ({best_overall_linkage} linkage)', fontsize=14, fontweight='bold')
ax.set_xlabel('Cluster Index', fontsize=12)
ax.set_ylabel('Distance', fontsize=12)
ax.axhline(y=cutoff_height, color='red', linestyle='--', alpha=0.7, label='Possible cutoff')
ax.legend()

plt.tight_layout()
plt.show()

print("\nüí° Dendrogram shows hierarchical relationships between clusters")
print("   Large vertical lines indicate good separation points")
print("   Large vertical lines indicate good separation points")

## 6. 🔺 Multi-Level Hierarchical Clustering

In [None]:
# Two-level clustering: a coarse split with the best Experiment 1
# configuration, then an independent sub-clustering of each coarse cluster.
print("="*70)
print("MULTI-LEVEL HIERARCHICAL CLUSTERING")
print("="*70)
print("Level 1: Coarse clustering (K=2-5)")
print("Level 2: Fine-grained sub-clustering within each Level 1 cluster\n")

# Level 1: Coarse clustering
best_k_coarse = int(best_linkage['k'])
print(f"Level 1: Using K={best_k_coarse} ({best_overall_linkage} linkage)")

# Ward is fitted with euclidean distances; the other linkages use cosine.
level_metric = 'euclidean' if best_overall_linkage == 'ward' else 'cosine'

agg_l1 = AgglomerativeClustering(
    n_clusters=best_k_coarse,
    linkage=best_overall_linkage,
    metric=level_metric
)

labels_l1 = agg_l1.fit_predict(codebert_norm)
sil_l1 = silhouette_score(codebert_norm, labels_l1, metric='cosine')

print(f"Level 1 Silhouette: {sil_l1:.6f}\n")

# Level 2: Sub-cluster each Level 1 cluster
print("Level 2: Sub-clustering within each Level 1 cluster")

labels_l2 = np.zeros(len(codebert_norm), dtype=int)
cluster_offset = 0      # next free global sub-cluster id
l2_silhouettes = []     # best silhouette of each sub-clustered Level 1 cluster
l2_cluster_sizes = []   # matching sample counts, used as averaging weights

for l1_cluster_id in range(best_k_coarse):
    mask_l1 = labels_l1 == l1_cluster_id
    embeddings_l1 = codebert_norm[mask_l1]
    indices_l1 = np.where(mask_l1)[0]

    print(f"\n  Cluster {l1_cluster_id} ({len(embeddings_l1)} samples):")

    if len(embeddings_l1) < 20:
        print(f"    Too few samples, keeping as single cluster")
        labels_l2[indices_l1] = cluster_offset
        cluster_offset += 1
        continue

    # Test K=2,3,4 for sub-clustering and keep the labels of the best one.
    best_sub_k = None
    best_sub_sil = -1
    best_sub_labels = None

    for sub_k in [2, 3, 4]:
        # Require at least 5 samples per requested sub-cluster.
        if len(embeddings_l1) < sub_k * 5:
            continue

        try:
            agg_l2 = AgglomerativeClustering(
                n_clusters=sub_k,
                linkage=best_overall_linkage,
                metric=level_metric
            )

            labels_l2_temp = agg_l2.fit_predict(embeddings_l1)
            sil_l2_temp = silhouette_score(embeddings_l1, labels_l2_temp, metric='cosine')
        except Exception as e:
            # Report the failure instead of silently skipping this sub-K.
            print(f"    Sub-K={sub_k}: Failed - {str(e)[:50]}")
            continue

        print(f"    Sub-K={sub_k}: Silhouette={sil_l2_temp:.4f}")

        if best_sub_labels is None or sil_l2_temp > best_sub_sil:
            best_sub_sil = sil_l2_temp
            best_sub_k = sub_k
            best_sub_labels = labels_l2_temp

    if best_sub_labels is None:
        # Every candidate failed: keep the Level 1 cluster whole rather than
        # crashing on a refit or recording a placeholder score.
        print(f"    No valid sub-clustering, keeping as single cluster")
        labels_l2[indices_l1] = cluster_offset
        cluster_offset += 1
        continue

    # Apply best sub-clustering, reusing the labels found during the search
    # rather than fitting the same configuration a second time.
    labels_l2[indices_l1] = best_sub_labels + cluster_offset
    cluster_offset += best_sub_k

    l2_silhouettes.append(best_sub_sil)
    l2_cluster_sizes.append(len(embeddings_l1))
    print(f"    ‚úÖ Best sub-K: {best_sub_k}, Silhouette: {best_sub_sil:.4f}")

# Overall Level 2 silhouette
sil_l2_overall = silhouette_score(codebert_norm, labels_l2, metric='cosine')
# Size-weighted mean of the per-cluster scores, so the value matches the
# "Weighted Avg" label it is reported under.
sil_l2_weighted = float(np.average(l2_silhouettes, weights=l2_cluster_sizes)) if l2_silhouettes else 0

print(f"\n{'='*70}")
print("MULTI-LEVEL RESULTS")
print(f"{'='*70}")
print(f"Level 1 (K={best_k_coarse}): Silhouette={sil_l1:.6f}")
print(f"Level 2 (K={cluster_offset}): Overall Silhouette={sil_l2_overall:.6f}")
print(f"Level 2 Weighted Avg: {sil_l2_weighted:.6f}")
print(f"\nüí° Multi-level provides hierarchical organization for interpretability")

## 7. 📊 Visualizations

In [None]:
# Heatmap: Linkage vs K
if len(linkage_df) == 0:
    print("No linkage results to visualize")
else:
    # Rows = linkage method, columns = K, cells = silhouette score.
    score_grid = linkage_df.pivot_table(values='silhouette', index='linkage', columns='k')

    fig, ax = plt.subplots(figsize=(14, 6))
    heatmap_options = dict(
        annot=True,
        fmt='.4f',
        cmap='RdYlGn',
        center=0.4,
        cbar_kws={'label': 'Silhouette Score'},
    )
    sns.heatmap(score_grid, ax=ax, **heatmap_options)

    ax.set_title('Hierarchical Clustering: Linkage Method vs K', fontsize=14, fontweight='bold')
    ax.set_xlabel('Number of Clusters (K)', fontsize=12)
    ax.set_ylabel('Linkage Method', fontsize=12)

    plt.tight_layout()
    plt.show()

In [None]:
# UMAP visualization of best clustering
print("Running UMAP dimensionality reduction...")

# Project the normalised embeddings to 2-D for plotting only.
umap_options = dict(
    n_components=2,
    random_state=42,
    n_neighbors=15,
    min_dist=0.1,
    metric='cosine',
)
reducer = umap.UMAP(**umap_options)
embeddings_2d = reducer.fit_transform(codebert_norm)

# Get best clustering labels (refit of the best Experiment 1 configuration)
best_k = int(best_linkage['k'])
best_fit_metric = 'euclidean' if best_overall_linkage == 'ward' else 'cosine'
agg_best = AgglomerativeClustering(
    n_clusters=best_k,
    linkage=best_overall_linkage,
    metric=best_fit_metric
)
labels_best = agg_best.fit_predict(codebert_norm)

# Plot: left = best single-level clustering, right = multi-level clustering.
fig, axes = plt.subplots(1, 2, figsize=(18, 7))

panels = [
    (
        labels_best,
        'tab10',
        f'Best Config: {best_overall_linkage} linkage, K={best_k}\nSilhouette={best_linkage["silhouette"]:.4f}',
        'Cluster ID',
    ),
    (
        labels_l2,
        'tab20',
        f'Multi-Level: L1={best_k_coarse}, L2={cluster_offset}\nSilhouette={sil_l2_overall:.4f}',
        'Sub-Cluster ID',
    ),
]

for panel_ax, (panel_labels, palette, panel_title, bar_label) in zip(axes, panels):
    points = panel_ax.scatter(
        embeddings_2d[:, 0],
        embeddings_2d[:, 1],
        c=panel_labels,
        cmap=palette,
        alpha=0.6,
        s=20
    )
    panel_ax.set_title(panel_title, fontweight='bold', fontsize=12)
    panel_ax.set_xlabel('UMAP 1')
    panel_ax.set_ylabel('UMAP 2')
    plt.colorbar(points, ax=panel_ax, label=bar_label)

plt.tight_layout()
plt.show()

print("‚úÖ UMAP visualization complete")

## 8. 💾 Export Results

In [None]:
# Combine all results: Experiment 1 always, Experiments 2 and 3 only when they
# produced rows (an empty placeholder frame is used otherwise).
optional_frames = [
    frame if len(frame) > 0 else pd.DataFrame()
    for frame in (scipy_df, metric_df)
]
all_results = pd.concat([linkage_df, *optional_frames], ignore_index=True)

all_results.to_csv('hierarchical_clustering_results.csv', index=False)

# Best configuration, converted to plain Python types so json can serialise it.
single_level_summary = {
    'linkage': str(best_linkage['linkage']),
    'k': int(best_linkage['k']),
    'silhouette': float(best_linkage['silhouette']),
    'time_sec': float(best_linkage['time_sec'])
}

multi_level_summary = {
    'level1_k': int(best_k_coarse),
    'level2_total_k': int(cluster_offset),
    'level1_silhouette': float(sil_l1),
    'level2_silhouette': float(sil_l2_overall),
    'level2_weighted_avg': float(sil_l2_weighted)
}

baseline_summary = {
    'linkage': 'single',
    'k': 2,
    'silhouette': 0.5952
}

# Relative change of the best silhouette versus the Notebook 04 baseline, in percent.
improvement_pct = float((best_linkage['silhouette'] - 0.5952) / 0.5952 * 100)

best_config = {
    'approach': 'hierarchical_clustering',
    'embedding_model': 'CodeBERT',
    'preprocessing': 'pure (no PCA)',
    'best_single_level': single_level_summary,
    'best_multi_level': multi_level_summary,
    'baseline_from_notebook04': baseline_summary,
    'improvement_pct': improvement_pct
}

with open('hierarchical_best_config.json', 'w') as f:
    json.dump(best_config, f, indent=2)

for message in ('‚úÖ Exported:',
                '  - hierarchical_clustering_results.csv',
                '  - hierarchical_best_config.json'):
    print(message)

## 9. 📝 Final Summary

In [None]:
# Final report: prints the experiment list, the best single-level and
# multi-level results, and the comparison against the Notebook 04 baseline.
print('='*80)
print('HIERARCHICAL CLUSTERING OPTIMIZATION - FINAL SUMMARY')
print('='*80)

print(f'\nüìä EXPERIMENTS CONDUCTED:')
print(f'  Experiment 1: Sklearn linkages (single, complete, average, ward)')
print(f'  Experiment 2: Scipy linkages (+ weighted, centroid, median)')
print(f'  Experiment 3: Distance metrics (cosine, euclidean, manhattan, correlation)')
print(f'  Experiment 4: Dendrogram analysis')
print(f'  Experiment 5: Multi-level hierarchical clustering')
print(f'  Total configurations tested: {len(all_results)}')

# `best_linkage` is the best row of Experiment 1 (sklearn linkages) only.
print(f'\nüèÜ BEST SINGLE-LEVEL CONFIGURATION:')
print(f"  Linkage: {best_linkage['linkage']}")
print(f"  K: {int(best_linkage['k'])}")
print(f"  Silhouette: {best_linkage['silhouette']:.6f}")
print(f"  Time: {best_linkage['time_sec']:.2f}s")

print(f'\nüî∫ BEST MULTI-LEVEL CONFIGURATION:')
print(f"  Level 1: K={best_k_coarse} (Silhouette={sil_l1:.6f})")
print(f"  Level 2: K={cluster_offset} (Silhouette={sil_l2_overall:.6f})")
print(f"  Weighted Avg: {sil_l2_weighted:.6f}")

print(f'\nüìà COMPARISON:')
print(f"  Notebook 04 baseline: 0.5952 (single, K=2)")
print(f"  This optimization: {best_linkage['silhouette']:.6f}")
# Relative change versus the baseline silhouette, in percent.
improvement = (best_linkage['silhouette'] - 0.5952) / 0.5952 * 100
print(f"  Improvement: {improvement:+.2f}%")

print(f'\nüí° KEY FINDINGS:')
print(f"  1. Best linkage method: {best_linkage['linkage']}")
print(f"  2. Optimal K value: {int(best_linkage['k'])}")
# Finding 3 is printed only when Experiment 3 produced at least one result.
if len(metric_df) > 0:
    best_metric_name = metric_df.loc[metric_df['silhouette'].idxmax()]['metric']
    print(f"  3. Best distance metric: {best_metric_name}")
print(f"  4. Multi-level clustering provides {cluster_offset} fine-grained clusters")
print(f"  5. Pure CodeBERT embeddings (no PCA) work best")

print(f'\nüéØ PRODUCTION RECOMMENDATION:')
if best_linkage['silhouette'] > 0.5952:
    print(f"  ‚úÖ Use optimized config: {best_linkage['linkage']} linkage, K={int(best_linkage['k'])}")
    print(f"     Expected silhouette: {best_linkage['silhouette']:.4f}")
else:
    print(f"  ‚ÑπÔ∏è  Baseline from Notebook 04 is competitive")
    print(f"     Consider: single linkage, K=2, Silhouette=0.5952")

print(f'\nüí° MULTI-LEVEL USE CASE:')
print(f"  Level 1 ({best_k_coarse} clusters): High-level categorization")
print(f"  Level 2 ({cluster_offset} clusters): Fine-grained routing")
print(f"  Benefit: Hierarchical interpretability for model selection")

# Double-quoted outer string: reusing the same quote character inside an
# f-string replacement field is a SyntaxError before Python 3.12.
print(f"\n{'='*80}")