# Unsupervised Learning Analysis - Heart Disease Dataset

This notebook presents a comprehensive unsupervised learning analysis of the heart disease dataset using the K-Means and hierarchical clustering algorithms.

## Objectives:
1. Perform K-Means clustering with optimal K selection
2. Apply Hierarchical clustering with dendrogram analysis
3. Evaluate clustering performance using multiple metrics
4. Compare clustering results with true labels
5. Visualize clustering patterns and relationships

In [None]:
# Import required libraries
import sys
import os
sys.path.append('../src')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from clustering_analyzer import ClusteringAnalyzer

# pandas display: None removes the cap on printed columns and leaves the
# display width unset instead of pinning it to a fixed character count.
for option_name in ('display.max_columns', 'display.width'):
    pd.set_option(option_name, None)

# Plot appearance: matplotlib's 'default' style sheet with seaborn's "husl" palette.
plt.style.use('default')
sns.set_palette("husl")

print("Libraries imported successfully!")

## 1. Data Loading and Preparation

In [None]:
# Create the analyzer, passing a fixed random_state (42).
analyzer = ClusteringAnalyzer(random_state=42)

# Load the two processed tables: the PCA-transformed data and the cleaned data.
pca_data, cleaned_data = analyzer.load_data(
    '../data/processed/heart_disease_pca.csv',
    '../data/processed/heart_disease_cleaned.csv'
)

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() emits a stray "None" after each
# report. (Assumes load_data returns pandas DataFrames, as the .info() calls imply.)
print("\nPCA Data Info:")
pca_data.info()
print("\nCleaned Data Info:")
cleaned_data.info()

In [None]:
# Build the clustering inputs. The second return value is kept as the reference
# labels (y_true) for the PCA table only; for the cleaned table it is discarded.
X_pca, y_true = analyzer.prepare_clustering_data(pca_data, use_pca=True)
X_original, _ = analyzer.prepare_clustering_data(cleaned_data, use_pca=False)

print(f"PCA features shape: {X_pca.shape}")
print(f"Original features shape: {X_original.shape}")

# np.unique(..., return_counts=True) tallies labels of any dtype in one pass,
# whereas np.bincount raises on float-typed or negative labels. For integer
# labels 0..k-1 that all occur, the printed counts are the same as before.
target_classes, target_counts = np.unique(y_true, return_counts=True)
print(f"Target distribution: {target_counts}")
print(f"Target classes: {target_classes}")

## 2. K-Means Clustering Analysis

In [None]:
# Run the K-Means sweep on the PCA features with k_range=(2, 8).
print("Performing K-Means clustering analysis...")
kmeans_results = analyzer.kmeans_clustering(X_pca, k_range=(2, 8))

# Report the K selected by each criterion stored in the results.
print(f"\nOptimal K (Elbow Method): {kmeans_results['optimal_k_elbow']}")
print(f"Optimal K (Silhouette): {kmeans_results['optimal_k_silhouette']}")

# Tabulate the per-K values: table column name -> key in kmeans_results.
column_sources = {
    'K': 'k_values',
    'Inertia': 'inertias',
    'Silhouette_Score': 'silhouette_scores',
}
results_df = pd.DataFrame(
    {column: kmeans_results[source] for column, source in column_sources.items()}
)

print("\nK-Means Results Summary:", results_df, sep="\n")

In [None]:
# Plot the elbow curve from the K-Means sweep stored in `kmeans_results`.
# NOTE(review): presumably draws inertia (and possibly silhouette) against K —
# confirm in clustering_analyzer.plot_elbow_curve.
analyzer.plot_elbow_curve(kmeans_results)

In [None]:
# Assign every sample a cluster using the model stored under 'best_model'.
best_kmeans = kmeans_results['best_model']
kmeans_labels = best_kmeans.predict(X_pca)

# Score the K-Means partition; the true labels are passed alongside the
# features and the cluster assignments.
kmeans_metrics = analyzer.evaluate_clustering(
    X_pca,
    kmeans_labels,
    y_true,
)

print("K-Means Clustering Evaluation:")
for metric_name, metric_value in kmeans_metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")

## 3. Hierarchical Clustering Analysis

In [None]:
# Run hierarchical clustering on the PCA features: 3 clusters, Ward linkage.
print("Performing Hierarchical clustering analysis...")
hierarchical_results = analyzer.hierarchical_clustering(
    X_pca,
    n_clusters=3,
    linkage_method='ward',
)

# Score the resulting partition with the same evaluation routine used for K-Means.
hierarchical_metrics = analyzer.evaluate_clustering(X_pca, hierarchical_results['cluster_labels'], y_true)

print("\nHierarchical Clustering Evaluation:")
for metric_name, metric_value in hierarchical_metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")

In [None]:
# Plot the dendrogram for the hierarchical clustering run, passing the full
# results object returned by analyzer.hierarchical_clustering.
analyzer.plot_dendrogram(hierarchical_results)

## 4. Clustering Visualization

In [None]:
# Scatter-plot the PCA feature matrix with the K-Means assignments; the true
# labels are passed as well.
# NOTE(review): presumably the true labels are drawn as a reference view —
# confirm in clustering_analyzer.plot_cluster_scatter.
print("K-Means Clustering Visualization:")
analyzer.plot_cluster_scatter(X_pca, kmeans_labels, y_true)

In [None]:
# Scatter-plot the PCA feature matrix with the hierarchical assignments
# (the 'cluster_labels' entry of hierarchical_results); the true labels are
# passed as well.
# NOTE(review): presumably the true labels are drawn as a reference view —
# confirm in clustering_analyzer.plot_cluster_scatter.
print("Hierarchical Clustering Visualization:")
analyzer.plot_cluster_scatter(X_pca, hierarchical_results['cluster_labels'], y_true)

## 5. Comparison with True Labels

In [None]:
# Set the K-Means assignments against the reference labels.
print("K-Means vs True Labels Analysis:")
kmeans_comparison = analyzer.compare_with_true_labels(kmeans_labels, y_true)

# Contingency table stored under 'crosstab' in the comparison result.
print("\nCross-tabulation (K-Means):", kmeans_comparison['crosstab'], sep="\n")

kmeans_ari = kmeans_comparison['adjusted_rand_score']
print(f"\nAdjusted Rand Score: {kmeans_ari:.4f}")

In [None]:
# Set the hierarchical assignments against the reference labels.
print("Hierarchical vs True Labels Analysis:")
hierarchical_comparison = analyzer.compare_with_true_labels(hierarchical_results['cluster_labels'], y_true)

# Contingency table stored under 'crosstab' in the comparison result.
print("\nCross-tabulation (Hierarchical):", hierarchical_comparison['crosstab'], sep="\n")

hierarchical_ari = hierarchical_comparison['adjusted_rand_score']
print(f"\nAdjusted Rand Score: {hierarchical_ari:.4f}")

## 6. Cluster Purity Analysis

In [None]:
# Per-cluster report for K-Means: purity, size and dominant label, read from
# the 'cluster_purity' entry of the comparison result.
print("K-Means Cluster Purity Analysis:")
kmeans_purity = kmeans_comparison['cluster_purity']
for cluster, stats in kmeans_purity.items():
    print(f"Cluster {cluster}:")
    print(f"  - Purity: {stats['purity']:.3f}")
    print(f"  - Size: {stats['size']} samples")
    print(f"  - Dominant Label: {stats['dominant_label']}")
    print()  # blank line between clusters

In [None]:
# Per-cluster report for hierarchical clustering: purity, size and dominant
# label, read from the 'cluster_purity' entry of the comparison result.
print("Hierarchical Cluster Purity Analysis:")
hierarchical_purity = hierarchical_comparison['cluster_purity']
for cluster, stats in hierarchical_purity.items():
    print(f"Cluster {cluster}:")
    print(f"  - Purity: {stats['purity']:.3f}")
    print(f"  - Size: {stats['size']} samples")
    print(f"  - Dominant Label: {stats['dominant_label']}")
    print()  # blank line between clusters

## 7. Comprehensive Analysis and Comparison

In [None]:
# Side-by-side metric table: one row per method, one column per metric.
method_metrics = {
    'K-Means': kmeans_metrics,
    'Hierarchical': hierarchical_metrics,
}

# Table column name -> key read from each method's metrics.
# NOTE(review): assumes evaluate_clustering reports 'n_clusters' and 'inertia'
# for both methods — confirm in clustering_analyzer.
metric_columns = {
    'Silhouette_Score': 'silhouette_score',
    'Adjusted_Rand_Score': 'adjusted_rand_score',
    'N_Clusters': 'n_clusters',
    'Inertia': 'inertia',
}

summary_columns = {'Method': list(method_metrics)}
for column_name, metric_key in metric_columns.items():
    summary_columns[column_name] = [
        per_method[metric_key] for per_method in method_metrics.values()
    ]
comparison_summary = pd.DataFrame(summary_columns)

print("Clustering Methods Comparison:", comparison_summary, sep="\n")

In [None]:
# Bar charts comparing the two methods on silhouette score (left panel) and
# adjusted Rand score (right panel), read from `comparison_summary`.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

methods = comparison_summary['Method']
silhouette_scores = comparison_summary['Silhouette_Score']
rand_scores = comparison_summary['Adjusted_Rand_Score']

# One entry per panel: (axes, scores, metric label, bar colours).
panels = [
    (axes[0], silhouette_scores, 'Silhouette Score', ['skyblue', 'lightcoral']),
    (axes[1], rand_scores, 'Adjusted Rand Score', ['lightgreen', 'orange']),
]

for ax, scores, metric_label, bar_colors in panels:
    ax.bar(methods, scores, color=bar_colors)
    ax.set_title(f'{metric_label} Comparison')
    ax.set_ylabel(metric_label)

    # Both metrics can be zero or negative. Anchoring the axis at
    # (0, max * 1.1) would then invert or collapse it, so the limits always
    # include 0 and extend 10% past the most extreme score on each side.
    # For all-positive scores this equals the previous (0, max * 1.1).
    y_min = min(0.0, min(scores) * 1.1)
    y_max = max(0.0, max(scores) * 1.1)
    if y_min == y_max:
        # Every score is exactly 0: use a nominal range to avoid a singular axis.
        y_max = 1.0
    ax.set_ylim(y_min, y_max)

    # Value labels: just above positive bars, just below negative ones.
    for position, score in enumerate(scores):
        if score >= 0:
            ax.text(position, score + 0.01, f'{score:.3f}', ha='center', va='bottom')
        else:
            ax.text(position, score - 0.01, f'{score:.3f}', ha='center', va='top')

plt.tight_layout()
plt.show()

## 8. Clustering on Original Features

In [None]:
# Repeat the K-Means analysis on the cleaned (non-PCA) feature table.
print("Clustering Analysis on Original Features:")

# A separate analyzer instance is used for this run, with the same random_state.
analyzer_original = ClusteringAnalyzer(random_state=42)
X_original_scaled, _ = analyzer_original.prepare_clustering_data(
    cleaned_data,
    use_pca=False,
)

# Sweep K over k_range=(2, 6) and label the samples with the stored best model.
kmeans_original_results = analyzer_original.kmeans_clustering(
    X_original_scaled,
    k_range=(2, 6),
)
best_kmeans_original = kmeans_original_results['best_model']
kmeans_original_labels = best_kmeans_original.predict(X_original_scaled)

# Scored against y_true, i.e. the labels obtained earlier from the PCA table.
kmeans_original_metrics = analyzer_original.evaluate_clustering(X_original_scaled, kmeans_original_labels, y_true)

print("\nK-Means on Original Features:")
for metric_name, metric_value in kmeans_original_metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")

In [None]:
# K-Means quality on the two feature spaces: one (name, metrics, feature matrix)
# entry per row of the table.
feature_sets = [
    ('PCA', kmeans_metrics, X_pca),
    ('Original', kmeans_original_metrics, X_original_scaled),
]

feature_comparison = pd.DataFrame({
    'Features': [name for name, metrics, matrix in feature_sets],
    'Silhouette_Score': [metrics['silhouette_score'] for name, metrics, matrix in feature_sets],
    'Adjusted_Rand_Score': [metrics['adjusted_rand_score'] for name, metrics, matrix in feature_sets],
    'N_Features': [matrix.shape[1] for name, metrics, matrix in feature_sets],
})

print("\nPCA vs Original Features Comparison:", feature_comparison, sep="\n")

## 9. Save Results and Models

In [None]:
# One row per sample: a running id, the reference label, and the cluster
# assigned by each of the three clustering runs.
assignment_columns = dict(
    sample_id=range(len(y_true)),
    true_label=y_true,
    kmeans_cluster=kmeans_labels,
    hierarchical_cluster=hierarchical_results['cluster_labels'],
    kmeans_original_cluster=kmeans_original_labels,
)
cluster_assignments = pd.DataFrame(assignment_columns)

# Make sure the output directory exists, then write the table without the index.
os.makedirs('../results/clustering', exist_ok=True)
cluster_assignments.to_csv(
    '../results/clustering/cluster_assignments.csv',
    index=False,
)

print("Cluster assignments saved to: ../results/clustering/cluster_assignments.csv")
print("\nFirst 10 assignments:", cluster_assignments.head(10), sep="\n")

In [None]:
# Save the clustering models through `analyzer` (the instance used for the
# PCA-feature runs), targeting ../models/unsupervised. `analyzer_original`,
# used for the original-feature run, is not saved here.
# NOTE(review): unlike ../results/clustering, this directory is not created
# in the notebook — confirm save_models creates it.
analyzer.save_models('../models/unsupervised')
print("Clustering models saved successfully!")

## 10. Summary and Conclusions

In [None]:
# Final text summary of the analysis: dataset facts, headline scores for each
# clustering run, pairwise findings, and recommendations.
print("UNSUPERVISED LEARNING ANALYSIS SUMMARY")
print("=" * 50)

print("\nDataset Information:")
print(f"- Total Samples: {len(y_true)}")
print(f"- PCA Features: {X_pca.shape[1]}")
print(f"- Original Features: {X_original_scaled.shape[1]}")
print(f"- True Classes: {len(np.unique(y_true))} (Heart Disease: No=0, Yes=1)")

# Pull out the headline scores once so every statement below uses the same values.
kmeans_sil = kmeans_metrics['silhouette_score']
hierarchical_sil = hierarchical_metrics['silhouette_score']
original_sil = kmeans_original_metrics['silhouette_score']
kmeans_ari = kmeans_metrics['adjusted_rand_score']
hierarchical_ari = hierarchical_metrics['adjusted_rand_score']
original_ari = kmeans_original_metrics['adjusted_rand_score']

print("\nBest Clustering Results:")
print(f"- K-Means (PCA): Silhouette={kmeans_sil:.3f}, ARI={kmeans_ari:.3f}")
print(f"- Hierarchical (PCA): Silhouette={hierarchical_sil:.3f}, ARI={hierarchical_ari:.3f}")
print(f"- K-Means (Original): Silhouette={original_sil:.3f}, ARI={original_ari:.3f}")

# Each finding compares two scores. Ties get their own message instead of
# silently declaring the second candidate the winner.
findings = [
    (
        kmeans_sil,
        hierarchical_sil,
        "- K-Means clustering shows better internal cluster quality (silhouette score)",
        "- Hierarchical clustering shows better internal cluster quality (silhouette score)",
        "- K-Means and Hierarchical clustering show the same internal cluster quality (silhouette score)",
    ),
    (
        kmeans_ari,
        hierarchical_ari,
        "- K-Means clustering aligns better with true heart disease labels",
        "- Hierarchical clustering aligns better with true heart disease labels",
        "- K-Means and Hierarchical clustering align equally well with true heart disease labels",
    ),
    (
        # Silhouette scores come from different feature spaces (and possibly
        # different K), so this is only a rough indication.
        kmeans_sil,
        original_sil,
        "- PCA features provide better clustering performance than original features",
        "- Original features provide better clustering performance than PCA features",
        "- PCA and original features provide the same clustering performance (silhouette score)",
    ),
]

print("\nKey Findings:")
for first_score, second_score, first_wins, second_wins, tie_message in findings:
    if first_score > second_score:
        print(first_wins)
    elif second_score > first_score:
        print(second_wins)
    else:
        print(tie_message)

# Report the measured ARI rather than asserting a fixed "moderate" level,
# which would be wrong whenever the scores are near zero or high.
best_ari = max(kmeans_ari, hierarchical_ari, original_ari)

print("\nRecommendations:")
print("- The clustering analysis reveals natural groupings in the heart disease data")
print("- Consider using clustering results as additional features for supervised learning")
print(f"- Highest ARI across methods: {best_ari:.3f} (1.0 = perfect agreement with true labels, about 0 = chance level)")
print("- Further investigation of cluster characteristics could provide clinical insights")

print("\nAnalysis completed successfully!")