# Part 3: Apply HDBSCAN Clustering

This notebook applies HDBSCAN clustering to the preprocessed text data.

## Input
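- `preprocessed_data.pickle`: dictionary written by Part 2 containing the DataFrame, the processed and original texts, the TF-IDF matrix, the vectorizer, the text column names, and the feature names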

## Output
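- `clustered_data.pickle`: the Part 2 contents (with the TF-IDF matrix stored in dense form) plus the per-response cluster labels, the fitted HDBSCAN clusterer, and the cluster/noise counts, for use in Part 4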

In [None]:
# Install required packages for clustering
!pip install hdbscan scikit-learn matplotlib seaborn pandas numpy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.metrics import silhouette_score
import hdbscan
import warnings
# Silence every warning category for the rest of the session to keep cell output
# readable. NOTE(review): this also hides warnings emitted by the imported
# libraries (e.g. deprecations) — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Reached only if all of the imports above succeeded.
print("✅ Libraries imported successfully!")

In [None]:
# Load preprocessed data from Part 2
print("📥 Loading preprocessed data from Part 2...")

# Entries this notebook reads from the Part 2 pickle. They are checked up front so
# that a stale or incompatible file fails with one clear message listing everything
# that is missing, instead of an opaque KeyError on the first absent key.
REQUIRED_KEYS = (
    'dataframe', 'processed_texts', 'original_texts', 'tfidf_matrix',
    'vectorizer', 'text_columns', 'feature_names',
)

try:
    # SECURITY: pickle.load can execute arbitrary code while deserializing.
    # Only load a file produced by this pipeline's own Part 2 notebook.
    with open('preprocessed_data.pickle', 'rb') as f:
        data = pickle.load(f)
except FileNotFoundError:
    print("❌ Preprocessed data file not found. Please run Part 2 first!")
    raise

missing_keys = [key for key in REQUIRED_KEYS if key not in data]
if missing_keys:
    raise KeyError(
        f"preprocessed_data.pickle is missing expected entries: {missing_keys}. "
        "Re-run Part 2 to regenerate it."
    )

# Unpack the Part 2 artifacts into the names used by the rest of the notebook.
df = data['dataframe']
processed_texts = data['processed_texts']
original_texts = data['original_texts']
X = data['tfidf_matrix']
vectorizer = data['vectorizer']
text_columns = data['text_columns']
feature_names = data['feature_names']

print("✅ Data loaded successfully!")
print(f"📊 Data shape: {X.shape}")
print(f"🔢 Number of responses: {len(processed_texts)}")
print(f"🔤 Number of features: {len(feature_names)}")

# The clustering cell consumes `X_dense`: anything exposing `.toarray()`
# (e.g. a SciPy sparse matrix) is densified, every other input is used as-is.
if not hasattr(X, 'toarray'):
    X_dense = X
    print("📦 Using dense matrix for clustering")
else:
    X_dense = X.toarray()
    print("📦 Converted sparse matrix to dense for clustering")

# Report the final sample/feature counts handed to the clusterer.
print(f"\n🎯 Ready for clustering with {X_dense.shape[0]} samples and {X_dense.shape[1]} features")

In [None]:
# Apply HDBSCAN clustering
print("🎯 Applying HDBSCAN clustering...")

# Hyperparameters scale with the corpus size: a cluster needs at least a tenth of
# the responses (never fewer than 2); min_samples is half of that (never below 1).
n_responses = len(processed_texts)
min_cluster_size = max(2, n_responses // 10)
min_samples = max(1, min_cluster_size // 2)

hdbscan_params = {
    'min_cluster_size': min_cluster_size,
    'min_samples': min_samples,
    'metric': 'euclidean',
    'cluster_selection_method': 'eom',
}

# Only the two adaptive parameters are echoed; the other two are fixed.
print(f"⚙️ HDBSCAN parameters:")
for param_name in ('min_cluster_size', 'min_samples'):
    print(f"   - {param_name}: {hdbscan_params[param_name]}")

# Build the clusterer, fit it on the dense matrix, and keep one label per row.
clusterer = hdbscan.HDBSCAN(**hdbscan_params)
cluster_labels = clusterer.fit_predict(X_dense)

# Store the assignment alongside the responses.
df['cluster'] = cluster_labels

# Summarize the labelling; the label -1 is treated as noise throughout.
is_noise = cluster_labels == -1
n_noise = int(is_noise.sum())
n_clusters = len(np.unique(cluster_labels[~is_noise]))
noise_pct = n_noise / len(cluster_labels) * 100

print(f"\n📊 Clustering Results:")
print(f"   🏷️  Number of clusters: {n_clusters}")
print(f"   🔇 Number of noise points: {n_noise}")
print(f"   📈 Percentage of noise: {noise_pct:.1f}%")

# Responses per label, ordered by label so that noise (-1) is listed first.
cluster_counts = pd.Series(cluster_labels).value_counts().sort_index()
print(f"\n📊 Cluster size distribution:")
for cluster_id, count in cluster_counts.items():
    heading = "🔇 Noise" if cluster_id == -1 else f"🏷️  Cluster {cluster_id}"
    print(f"   {heading}: {count} responses")

# Clustering quality: silhouette score over the clustered points only (noise excluded).
# It is reported only when there is more than one cluster and more than one clustered point.
clustered_mask = cluster_labels != -1
if n_clusters <= 1:
    print(f"\n📏 Silhouette Score: Cannot calculate (insufficient clusters)")
elif np.sum(clustered_mask) <= 1:
    print(f"\n📏 Silhouette Score: Cannot calculate (insufficient non-noise samples)")
else:
    silhouette_avg = silhouette_score(X_dense[clustered_mask], cluster_labels[clustered_mask])
    print(f"\n📏 Silhouette Score: {silhouette_avg:.3f}")

In [None]:
# Visualize clustering results: a 2x2 figure with cluster sizes, label shares,
# membership probabilities, and response lengths per cluster.
# Skipped entirely when no cluster was found.
if n_clusters > 0:
    print("📊 Creating visualizations...")

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # 1. Cluster size distribution (noise excluded)
    cluster_counts_viz = cluster_counts[cluster_counts.index != -1]
    if len(cluster_counts_viz) > 0:
        # Bars sit at consecutive positions; the tick labels carry the real cluster ids.
        axes[0, 0].bar(range(len(cluster_counts_viz)), cluster_counts_viz.values)
        axes[0, 0].set_title('Cluster Size Distribution')
        axes[0, 0].set_xlabel('Cluster ID')
        axes[0, 0].set_ylabel('Number of Responses')
        axes[0, 0].set_xticks(range(len(cluster_counts_viz)))
        axes[0, 0].set_xticklabels(cluster_counts_viz.index)

    # 2. Cluster membership pie chart (noise included as its own slice).
    # `cluster_counts` already holds the per-label counts sorted by label.
    colors = plt.cm.Set3(np.linspace(0, 1, len(cluster_counts)))
    labels = [f'Cluster {i}' if i != -1 else 'Noise' for i in cluster_counts.index]

    axes[0, 1].pie(cluster_counts.values, labels=labels, autopct='%1.1f%%', colors=colors)
    axes[0, 1].set_title('Cluster Distribution')

    # 3. Cluster membership probabilities
    if hasattr(clusterer, 'probabilities_'):
        axes[1, 0].hist(clusterer.probabilities_, bins=30, alpha=0.7)
        axes[1, 0].set_title('Cluster Membership Probabilities')
        axes[1, 0].set_xlabel('Probability')
        axes[1, 0].set_ylabel('Frequency')
    else:
        axes[1, 0].text(0.5, 0.5, 'Probabilities not available',
                        ha='center', va='center', transform=axes[1, 0].transAxes)
        axes[1, 0].set_title('Cluster Probabilities')

    # 4. Response length (word count of the processed text) by cluster, noise excluded
    response_lengths = [len(text.split()) for text in processed_texts]
    cluster_df_viz = pd.DataFrame({
        'cluster': cluster_labels,
        'length': response_lengths
    })
    clustered_lengths = cluster_df_viz[cluster_df_viz['cluster'] != -1]

    # All clusters share one set of bin edges. With independent `bins=15` per cluster
    # each histogram gets its own bar widths and the overlaid bars are not comparable.
    length_bins = np.histogram_bin_edges(clustered_lengths['length'], bins=15)
    for cluster_id, group in clustered_lengths.groupby('cluster'):
        axes[1, 1].hist(group['length'], bins=length_bins, alpha=0.6,
                        label=f'Cluster {cluster_id}')

    axes[1, 1].set_title('Response Length Distribution by Cluster')
    axes[1, 1].set_xlabel('Number of Words')
    axes[1, 1].set_ylabel('Frequency')
    # At least one cluster exists in this branch, so the legend is never empty.
    axes[1, 1].legend()

    plt.tight_layout()
    plt.show()

else:
    print("⚠️ No clusters found - skipping visualizations")

In [None]:
# Save clustering results for Part 4
# The bundle carries the Part 2 artifacts forward together with this notebook's results.
clustered_data = dict(
    dataframe=df,
    cluster_labels=cluster_labels,
    clusterer=clusterer,
    processed_texts=processed_texts,
    original_texts=original_texts,
    vectorizer=vectorizer,
    tfidf_matrix=X_dense,  # the X_dense array that was clustered, not the original X
    text_columns=text_columns,
    feature_names=feature_names,
    n_clusters=n_clusters,
    n_noise=n_noise,
    cluster_counts=cluster_counts,
)

# Serialize the whole bundle into a single pickle file.
with open('clustered_data.pickle', 'wb') as out_file:
    pickle.dump(clustered_data, out_file)

# Confirm the save and recap the outcome of the clustering run.
print("✅ Clustering results saved!")
print("📁 File created: clustered_data.pickle")
print(f"🎯 Successfully clustered {len(processed_texts)} responses into {n_clusters} clusters")

if n_clusters <= 0:
    print("⚠️ No clusters were found in the data")
else:
    total_labels = len(cluster_labels)
    print(f"📊 Cluster summary:")
    # One line per real cluster, each with its share of all responses.
    real_clusters = cluster_counts[cluster_counts.index != -1]
    for cluster_id, count in real_clusters.items():
        print(f"   🏷️  Cluster {cluster_id}: {count} responses ({count/total_labels*100:.1f}%)")
    # The noise line appears only when at least one point was left unassigned.
    if n_noise > 0:
        print(f"   🔇 Noise: {n_noise} responses ({n_noise/total_labels*100:.1f}%)")

print(f"\n➡️ Ready for Part 4: Summarization and Analysis")