In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_regression, SelectKBest
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering, AgglomerativeClustering, OPTICS
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import NearestNeighbors
# Re-create the `np.infty` alias on the numpy module *before* the imports
# below run.
# NOTE(review): presumably one of the libraries imported after this line
# (umap / hdbscan) still references `np.infty`, which newer NumPy releases no
# longer provide -- confirm which one, and drop the patch once it is fixed.
np.infty = np.inf  # Monkey patch for compatibility

import umap
import warnings
import hdbscan

In [None]:
class ClusteringFeatureSelector:
    """Rank the columns of a DataFrame as candidate clustering features.

    The data has no labels, so every method relies on an unsupervised
    heuristic or on a synthetic target built from the data itself (first
    principal component, k-means labels). Each ``*_selection`` /
    ``*_importance`` method returns a ``(feature_names, scores)`` pair;
    ``ensemble_selection`` merges several of them by rank voting.

    Parameters
    ----------
    data : pd.DataFrame
        Input table; every column is treated as a candidate feature and is
        passed to ``StandardScaler``.
    n_features : int
        Maximum number of features each method returns.
    """

    def __init__(self, data, n_features=10):
        self.data = data
        self.n_features = n_features
        # A standardised copy is kept so scale-sensitive methods (PCA,
        # k-means, LASSO, ...) do not favour columns with large units.
        self.scaler = StandardScaler()
        self.scaled_data = self.scaler.fit_transform(data)

    def pca_based_selection(self):
        """Select features by PCA loadings weighted by explained variance.

        Returns
        -------
        tuple
            ``(names, scores)`` for the ``n_features`` highest-scoring
            columns, listed in ascending order of score.
        """
        # Keep every component so no direction of variance is ignored.
        pca = PCA()
        pca.fit(self.scaled_data)

        absolute_components = np.abs(pca.components_)

        # Scale each component's absolute loadings by the share of variance
        # that component explains, then accumulate per feature.
        weighted_importance = np.sum(
            absolute_components * pca.explained_variance_ratio_[:, np.newaxis],
            axis=0,
        )

        top_indices = weighted_importance.argsort()[-self.n_features:]
        return self.data.columns[top_indices].tolist(), weighted_importance[top_indices]

    def variance_based_selection(self):
        """Select the features with the highest variance in the raw data.

        Returns
        -------
        tuple
            ``(names, variances)`` in descending order of variance.
        """
        variances = self.data.var()
        top_features = variances.nlargest(self.n_features)
        return top_features.index.tolist(), top_features.values

    def correlation_based_selection(self, threshold=0.9):
        """Drop one feature from every highly correlated pair.

        Parameters
        ----------
        threshold : float
            Absolute correlation above which the later column of a pair is
            dropped.

        Returns
        -------
        tuple
            ``(names, None)``; this method produces no per-feature score.
        """
        corr_matrix = self.data.corr().abs()
        # Keep only the strict upper triangle so each pair is inspected once
        # and a column is never compared with itself.
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

        to_drop = [column for column in upper.columns if (upper[column] > threshold).any()]
        selected_features = [col for col in self.data.columns if col not in to_drop]

        # Still too many survivors: keep the ones with the largest variance.
        if len(selected_features) > self.n_features:
            variances = self.data[selected_features].var()
            selected_features = variances.nlargest(self.n_features).index.tolist()

        return selected_features, None

    def mutual_information_selection(self):
        """Select features by mutual information with the first PC.

        The first principal component of the scaled data serves as a
        synthetic regression target because no real target exists.

        Returns
        -------
        tuple
            ``(names, mi_scores)`` in ascending order of score.
        """
        pca = PCA(n_components=1)
        target = pca.fit_transform(self.scaled_data).ravel()

        mi_scores = mutual_info_regression(self.scaled_data, target, random_state=42)
        top_indices = mi_scores.argsort()[-self.n_features:]

        return self.data.columns[top_indices].tolist(), mi_scores[top_indices]

    def random_forest_importance(self):
        """Select features by random-forest importance on k-means labels.

        Returns
        -------
        tuple
            ``(names, importances)`` in ascending order of importance.
        """
        # Synthetic target: 3-cluster k-means labels on the scaled data.
        kmeans = KMeans(n_clusters=3, random_state=42)
        target = kmeans.fit_predict(self.scaled_data)

        # NOTE(review): the cluster ids are nominal, yet a *regressor* is
        # fitted on them, which treats 0 < 1 < 2 as an ordering. A classifier
        # may be the better fit -- left unchanged to keep results stable.
        rf = RandomForestRegressor(n_estimators=100, random_state=42)
        rf.fit(self.scaled_data, target)

        importances = rf.feature_importances_
        top_indices = importances.argsort()[-self.n_features:]

        return self.data.columns[top_indices].tolist(), importances[top_indices]

    def lasso_selection(self, alpha=0.01):
        """Select features with non-zero LASSO coefficients.

        The first principal component is used as the regression target.

        Parameters
        ----------
        alpha : float
            Regularisation strength passed to ``Lasso``.

        Returns
        -------
        tuple
            ``(names, abs_coefficients)``. At most ``n_features`` entries;
            fewer when LASSO zeroes out more coefficients. When trimming is
            needed the result is in ascending order of ``|coef|``.
        """
        pca = PCA(n_components=1)
        target = pca.fit_transform(self.scaled_data).ravel()

        lasso = Lasso(alpha=alpha, random_state=42)
        lasso.fit(self.scaled_data, target)

        abs_coef = np.abs(lasso.coef_)
        non_zero_indices = np.where(abs_coef > 0)[0]

        if len(non_zero_indices) > self.n_features:
            # Too many survivors: keep those with the largest |coef|.
            keep = np.argsort(abs_coef[non_zero_indices])[-self.n_features:]
            non_zero_indices = non_zero_indices[keep]

        return self.data.columns[non_zero_indices].tolist(), abs_coef[non_zero_indices]

    def silhouette_based_selection(self, n_clusters=3, n_iterations=10, random_state=None):
        """Score each feature by the silhouette of random subsets containing it.

        For every feature, ``n_iterations`` random subsets that include it
        are clustered with k-means and the silhouette scores are averaged.

        Parameters
        ----------
        n_clusters : int
            Number of k-means clusters used for each subset.
        n_iterations : int
            Number of random subsets drawn per feature.
        random_state : int or None
            Seed for the subset sampling. ``None`` (default) draws from
            numpy's global random state, as before.

        Returns
        -------
        tuple
            ``(names, mean_scores)`` as two lists in descending order of
            mean silhouette score.
        """
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        n_columns = len(self.data.columns)
        subset_size = min(self.n_features - 1, n_columns - 1)
        feature_scores = {}

        for i, feature in enumerate(self.data.columns):
            # Candidate companions do not change across iterations.
            candidates = [j for j in range(n_columns) if j != i]
            scores = []
            for _ in range(n_iterations):
                other_features = rng.choice(candidates, size=subset_size, replace=False)
                feature_subset = np.append(other_features, i)
                subset = self.scaled_data[:, feature_subset]

                kmeans = KMeans(n_clusters=n_clusters, random_state=42)
                labels = kmeans.fit_predict(subset)

                # Silhouette is undefined for a single cluster.
                if len(np.unique(labels)) > 1:
                    scores.append(silhouette_score(subset, labels))

            feature_scores[feature] = np.mean(scores) if scores else 0

        sorted_features = sorted(feature_scores.items(), key=lambda x: x[1], reverse=True)
        top = sorted_features[:self.n_features]
        return [name for name, _ in top], [score for _, score in top]

    def ensemble_selection(self, methods=('pca', 'variance', 'mutual_info', 'random_forest')):
        """Combine several selection methods by rank voting.

        Each method contributes ``len(features) - rank`` points to every
        feature it returns, where rank 0 is that method's *best* feature.

        Parameters
        ----------
        methods : iterable of str
            Keys among ``'pca'``, ``'variance'``, ``'mutual_info'``,
            ``'random_forest'``, ``'correlation'`` and ``'lasso'``. Unknown
            names are skipped.

        Returns
        -------
        list
            Up to ``n_features`` feature names, most-voted first.
        """
        method_map = {
            'pca': self.pca_based_selection,
            'variance': self.variance_based_selection,
            'mutual_info': self.mutual_information_selection,
            'random_forest': self.random_forest_importance,
            'correlation': self.correlation_based_selection,
            'lasso': self.lasso_selection,
        }

        feature_votes = {}
        for method in methods:
            if method not in method_map:
                continue
            features, scores = method_map[method]()

            # The selectors do not agree on ordering (argsort-based ones are
            # ascending, variance is descending). Order by score, best first,
            # so position really means rank; the stable sort leaves
            # already-descending lists untouched.
            if scores is not None:
                order = np.argsort(-np.asarray(scores, dtype=float), kind='stable')
                features = [features[k] for k in order]

            n_ranked = len(features)
            for rank, feature in enumerate(features):
                feature_votes[feature] = feature_votes.get(feature, 0) + n_ranked - rank

        sorted_features = sorted(feature_votes.items(), key=lambda x: x[1], reverse=True)
        return [name for name, _ in sorted_features[:self.n_features]]

    def visualize_feature_importance(self, method='all'):
        """Plot one bar chart per scoring method plus the ensemble result.

        Parameters
        ----------
        method : str
            Currently ignored; all methods are always drawn. Kept so
            existing callers that pass it keep working.
        """
        fig, axes = plt.subplots(2, 3, figsize=(15, 10))
        axes = axes.ravel()

        methods = {
            'PCA-based': self.pca_based_selection,
            'Variance': self.variance_based_selection,
            'Mutual Information': self.mutual_information_selection,
            'Random Forest': self.random_forest_importance,
            'LASSO': self.lasso_selection,
        }

        for idx, (name, method_func) in enumerate(methods.items()):
            features, scores = method_func()
            if scores is not None:
                positions = range(len(features))
                axes[idx].barh(positions, scores)
                axes[idx].set_yticks(positions)
                axes[idx].set_yticklabels(features)
                axes[idx].set_title(f'{name} Feature Importance')
                axes[idx].set_xlabel('Importance Score')

        # The sixth (last) panel shows the ensemble; bar length encodes the
        # position in the ensemble ranking, not a score.
        ensemble_features = self.ensemble_selection()
        positions = range(len(ensemble_features))
        axes[-1].barh(positions, range(len(ensemble_features), 0, -1))
        axes[-1].set_yticks(positions)
        axes[-1].set_yticklabels(ensemble_features)
        axes[-1].set_title('Ensemble Feature Selection')
        axes[-1].set_xlabel('Combined Rank')

        plt.tight_layout()
        plt.show()



In [None]:
# Load the clustering data set (path is relative to the working directory).
data = pd.read_csv('AppML_InitialProject_test_clustering.csv')

# One selector instance is shared by every method demonstrated below.
selector = ClusteringFeatureSelector(data, n_features=10)

# --- 1. PCA loadings ---------------------------------------------------
print("PCA-based feature selection:")
features, scores = selector.pca_based_selection()
print(features, end="\n\n")

# --- 2. Raw variance ---------------------------------------------------
print("Variance-based feature selection:")
features, scores = selector.variance_based_selection()
print(features, end="\n\n")

# --- 3. Correlation pruning (returns no scores) --------------------------
print("Correlation-based feature selection (removing highly correlated):")
features, _ = selector.correlation_based_selection(threshold=0.9)
print(features[:10], end="\n\n")

# --- 4. Mutual information ---------------------------------------------
print("Mutual Information feature selection:")
features, scores = selector.mutual_information_selection()
print(features, end="\n\n")

# --- 5. Random-forest importance -----------------------------------------
print("Random Forest feature importance:")
features, scores = selector.random_forest_importance()
print(features, end="\n\n")

# --- 6. LASSO ------------------------------------------------------------
print("LASSO feature selection:")
features, scores = selector.lasso_selection()
print(features, end="\n\n")

# --- 7. Rank-voting ensemble of the default methods ----------------------
print("Ensemble feature selection (combining multiple methods):")
ensemble_features = selector.ensemble_selection()
print(ensemble_features)

# Bar charts for every scoring method plus the ensemble.
selector.visualize_feature_importance()

# Keep only the ensemble-selected columns and standardise them again.
selected_data = data[ensemble_features]
scaler = StandardScaler()
scaled_selected = scaler.fit_transform(selected_data)

# Quick sanity check: 3-cluster k-means on the reduced feature set.
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(scaled_selected)

# Report how well separated those clusters are.
silhouette_avg = silhouette_score(scaled_selected, clusters)
print(f"\nSilhouette Score with selected features: {silhouette_avg:.3f}")

In [None]:
class AdvancedClusteringAnalysis:
    """Run, score and plot several clustering algorithms on one data set.

    Typical use: ``perform_dimensionality_reduction()`` to obtain 2-D
    embeddings for plotting, ``perform_clustering()`` to fit the algorithms
    (tuning k-means and DBSCAN when parameters are not supplied), then
    ``visualize_all_results()`` / ``plot_cluster_comparison()``.
    """

    def __init__(self, data, selected_columns=None):
        """
        Initialize clustering analysis

        Parameters:
        -----------
        data : pd.DataFrame
            Input data
        selected_columns : list
            List of column names to use for clustering; all columns are
            used when omitted or empty
        """
        self.data = data
        self.selected_columns = selected_columns if selected_columns else data.columns.tolist()
        self.X = data[self.selected_columns].values

        # Standardize the data
        self.scaler = StandardScaler()
        self.X_scaled = self.scaler.fit_transform(self.X)

        # 2-D embeddings keyed by method name ('PCA', 't-SNE', 'UMAP').
        self.dim_reduction_results = {}
        # Label arrays keyed by algorithm name. Initialised here so the
        # evaluation/plot methods see an empty mapping, not a missing
        # attribute, when called before perform_clustering().
        self.clustering_results = {}

    def perform_dimensionality_reduction(self):
        """Compute 2-D PCA, t-SNE and UMAP embeddings of the scaled data.

        Results are stored in ``self.dim_reduction_results``.
        """
        print("Performing dimensionality reduction...")

        # PCA
        pca = PCA(n_components=2)
        self.dim_reduction_results['PCA'] = pca.fit_transform(self.X_scaled)
        print(f"PCA explained variance ratio: {pca.explained_variance_ratio_}")

        # t-SNE. The iteration count is left at the library default (1000,
        # the value previously passed explicitly) because the keyword was
        # renamed from `n_iter` to `max_iter` in newer scikit-learn releases.
        # Perplexity has to stay below the number of samples.
        n_samples = self.X_scaled.shape[0]
        perplexity = min(30, max(1, n_samples - 1))
        tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
        self.dim_reduction_results['t-SNE'] = tsne.fit_transform(self.X_scaled)

        # UMAP
        umap_reducer = umap.UMAP(n_components=2, random_state=42, n_neighbors=15, min_dist=0.1)
        self.dim_reduction_results['UMAP'] = umap_reducer.fit_transform(self.X_scaled)

        print("Dimensionality reduction completed!")

    def find_optimal_kmeans_clusters(self, max_clusters=15):
        """Scan k = 2..max_clusters for k-means and plot four quality metrics.

        Parameters
        ----------
        max_clusters : int
            Largest k to try; capped at ``n_samples - 1`` because the
            silhouette score needs fewer clusters than samples.

        Returns
        -------
        int
            The k with the highest silhouette score.

        Raises
        ------
        ValueError
            If no k >= 2 can be evaluated.
        """
        print("\nFinding optimal number of clusters for K-means...")

        upper_k = min(max_clusters, self.X_scaled.shape[0] - 1)
        if upper_k < 2:
            raise ValueError(
                "Need max_clusters >= 2 and at least 3 samples to search for k"
            )
        K = range(2, upper_k + 1)

        inertias = []
        silhouette_scores = []
        davies_bouldin_scores = []
        calinski_scores = []

        for k in K:
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            labels = kmeans.fit_predict(self.X_scaled)

            inertias.append(kmeans.inertia_)
            silhouette_scores.append(silhouette_score(self.X_scaled, labels))
            davies_bouldin_scores.append(davies_bouldin_score(self.X_scaled, labels))
            calinski_scores.append(calinski_harabasz_score(self.X_scaled, labels))

        # One panel per metric: (axes position, values, line style, y-label, title).
        fig, axes = plt.subplots(2, 2, figsize=(12, 10))
        panels = [
            ((0, 0), inertias, 'bo-', 'Inertia', 'Elbow Method'),
            ((0, 1), silhouette_scores, 'ro-', 'Silhouette Score',
             'Silhouette Analysis (Higher is better)'),
            ((1, 0), davies_bouldin_scores, 'go-', 'Davies-Bouldin Index',
             'Davies-Bouldin Index (Lower is better)'),
            ((1, 1), calinski_scores, 'mo-', 'Calinski-Harabasz Index',
             'Calinski-Harabasz Index (Higher is better)'),
        ]
        for position, values, line_style, y_label, title in panels:
            ax = axes[position]
            ax.plot(K, values, line_style)
            ax.set_xlabel('Number of clusters (k)')
            ax.set_ylabel(y_label)
            ax.set_title(title)
            ax.grid(True)

        plt.tight_layout()
        plt.show()

        # Find optimal k
        optimal_k_silhouette = K[np.argmax(silhouette_scores)]
        optimal_k_davies = K[np.argmin(davies_bouldin_scores)]

        print(f"Optimal k based on Silhouette Score: {optimal_k_silhouette}")
        print(f"Optimal k based on Davies-Bouldin Index: {optimal_k_davies}")

        return optimal_k_silhouette

    def find_optimal_dbscan_params(self):
        """Grid-search DBSCAN ``eps`` / ``min_samples`` by silhouette score.

        Candidate ``eps`` values span the 90th-99th percentile of the
        k-distance curve, which is also plotted.

        Returns
        -------
        tuple
            ``(eps, min_samples)`` of the best-scoring combination, or the
            defaults ``(0.5, 5)`` when no combination yields a positive
            silhouette score.
        """
        print("\nFinding optimal DBSCAN parameters...")

        # With n_neighbors=5 and the training points as queries, each point
        # is its own first neighbour, so the last column is the distance to
        # the 4th *other* point.
        neighbors = NearestNeighbors(n_neighbors=5)
        neighbors_fit = neighbors.fit(self.X_scaled)
        distances, _ = neighbors_fit.kneighbors(self.X_scaled)

        # Sort distances
        distances = np.sort(distances[:, -1], axis=0)

        # Plot k-distance graph
        plt.figure(figsize=(10, 6))
        plt.plot(distances)
        plt.xlabel('Points sorted by distance')
        plt.ylabel('4th Nearest Neighbor Distance')
        plt.title('K-distance Graph for DBSCAN eps Selection')
        plt.grid(True)
        plt.show()

        # Try different parameter combinations
        eps_values = np.linspace(np.percentile(distances, 90), np.percentile(distances, 99), 10)
        # DBSCAN rejects eps <= 0 (possible with many duplicate rows).
        eps_values = eps_values[eps_values > 0]
        min_samples_values = [3, 4, 5, 6]

        results = []

        for eps in eps_values:
            for min_samples in min_samples_values:
                dbscan = DBSCAN(eps=eps, min_samples=min_samples)
                labels = dbscan.fit_predict(self.X_scaled)

                mask = labels != -1
                n_clusters = len(set(labels[mask]))
                n_noise = int(np.sum(~mask))

                # Silhouette needs at least two real clusters; noise points
                # are excluded from the score. -1 marks "not scorable".
                if n_clusters > 1:
                    sil_score = silhouette_score(self.X_scaled[mask], labels[mask])
                else:
                    sil_score = -1

                results.append({
                    'eps': eps,
                    'min_samples': min_samples,
                    'n_clusters': n_clusters,
                    'n_noise': n_noise,
                    'silhouette': sil_score
                })

        # Explicit columns keep the lookups below valid when nothing was tried.
        results_df = pd.DataFrame(
            results,
            columns=['eps', 'min_samples', 'n_clusters', 'n_noise', 'silhouette'],
        )

        # Find best parameters
        valid_results = results_df[results_df['silhouette'] > 0]
        if valid_results.empty:
            print("No valid DBSCAN parameters found. Using defaults.")
            return 0.5, 5

        best_params = valid_results.loc[valid_results['silhouette'].idxmax()]
        print("\nBest DBSCAN parameters:")
        print(f"eps: {best_params['eps']:.3f}")
        print(f"min_samples: {int(best_params['min_samples'])}")
        print(f"Number of clusters: {int(best_params['n_clusters'])}")
        print(f"Noise points: {int(best_params['n_noise'])}")
        print(f"Silhouette score: {best_params['silhouette']:.3f}")

        return float(best_params['eps']), int(best_params['min_samples'])

    def perform_clustering(self, optimal_k=None, eps=None, min_samples=None):
        """Fit all clustering algorithms and evaluate them.

        Parameters
        ----------
        optimal_k : int or None
            Cluster count for K-means, GMM, Spectral and Hierarchical;
            searched with ``find_optimal_kmeans_clusters`` when ``None``.
        eps, min_samples : float / int or None
            DBSCAN parameters; both are searched with
            ``find_optimal_dbscan_params`` when either is ``None``.
            ``min_samples`` is also reused for OPTICS and HDBSCAN.

        Returns
        -------
        dict
            Algorithm name -> label array (also stored on
            ``self.clustering_results``).
        """
        print("\nPerforming clustering...")

        clustering_results = {}

        # K-means
        if optimal_k is None:
            optimal_k = self.find_optimal_kmeans_clusters()

        kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
        clustering_results['K-means'] = kmeans.fit_predict(self.X_scaled)

        # DBSCAN
        if eps is None or min_samples is None:
            eps, min_samples = self.find_optimal_dbscan_params()

        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        clustering_results['DBSCAN'] = dbscan.fit_predict(self.X_scaled)

        # Gaussian Mixture Model
        gmm = GaussianMixture(n_components=optimal_k, random_state=42, covariance_type='full')
        clustering_results['GMM'] = gmm.fit_predict(self.X_scaled)

        # Spectral Clustering
        spectral = SpectralClustering(n_clusters=optimal_k, random_state=42, affinity='rbf')
        clustering_results['Spectral'] = spectral.fit_predict(self.X_scaled)

        # Hierarchical Clustering
        hierarchical = AgglomerativeClustering(n_clusters=optimal_k)
        clustering_results['Hierarchical'] = hierarchical.fit_predict(self.X_scaled)

        # OPTICS, reusing the DBSCAN min_samples
        optics = OPTICS(min_samples=min_samples, xi=0.05, min_cluster_size=0.05)
        clustering_results['OPTICS'] = optics.fit_predict(self.X_scaled)

        # HDBSCAN, with min_samples doubling as the minimum cluster size
        hdb = hdbscan.HDBSCAN(min_cluster_size=min_samples, min_samples=min_samples)
        clustering_results['HDBSCAN'] = hdb.fit_predict(self.X_scaled)

        self.clustering_results = clustering_results

        # Evaluate clustering results
        self.evaluate_clustering()

        return clustering_results

    def evaluate_clustering(self):
        """Print and return a quality table for every stored clustering.

        Points labelled ``-1`` are treated as noise and excluded from the
        metrics. Metrics are NaN when fewer than two real clusters exist.

        Returns
        -------
        pd.DataFrame
            One row per algorithm with the columns Algorithm, Clusters,
            Noise Points, Silhouette, Davies-Bouldin, Calinski-Harabasz.
        """
        print("\nClustering Evaluation:")
        print("-" * 60)

        columns = ['Algorithm', 'Clusters', 'Noise Points',
                   'Silhouette', 'Davies-Bouldin', 'Calinski-Harabasz']
        evaluation_results = []

        for name, labels in self.clustering_results.items():
            # Accept plain lists as well as arrays for the masking below.
            labels = np.asarray(labels)
            mask = labels != -1
            n_clusters = len(set(labels[mask].tolist()))
            n_noise = int(np.sum(~mask))

            if n_clusters > 1:
                sil_score = silhouette_score(self.X_scaled[mask], labels[mask])
                db_score = davies_bouldin_score(self.X_scaled[mask], labels[mask])
                ch_score = calinski_harabasz_score(self.X_scaled[mask], labels[mask])
            else:
                sil_score = db_score = ch_score = np.nan

            evaluation_results.append({
                'Algorithm': name,
                'Clusters': n_clusters,
                'Noise Points': n_noise,
                'Silhouette': sil_score,
                'Davies-Bouldin': db_score,
                'Calinski-Harabasz': ch_score
            })

        # Explicit columns give an empty result the same schema.
        eval_df = pd.DataFrame(evaluation_results, columns=columns)
        print(eval_df.to_string(index=False, float_format='%.3f'))

        return eval_df

    def visualize_all_results(self):
        """Plot every clustering on every 2-D embedding (grid of scatters).

        Requires ``perform_dimensionality_reduction()`` and
        ``perform_clustering()`` to have been run.
        """
        dim_methods = ['PCA', 't-SNE', 'UMAP']
        cluster_methods = list(self.clustering_results.keys())

        n_rows = len(cluster_methods)
        n_cols = len(dim_methods)

        # squeeze=False always yields a 2-D axes array, even for one row.
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)

        for i, cluster_method in enumerate(cluster_methods):
            labels = self.clustering_results[cluster_method]
            for j, dim_method in enumerate(dim_methods):
                ax = axes[i, j]
                X_reduced = self.dim_reduction_results[dim_method]

                scatter = ax.scatter(X_reduced[:, 0], X_reduced[:, 1],
                                     c=labels, cmap='viridis', s=50, alpha=0.6)

                # Overlay noise points (label -1) with a distinct marker.
                if -1 in labels:
                    noise_mask = labels == -1
                    ax.scatter(X_reduced[noise_mask, 0], X_reduced[noise_mask, 1],
                               c='red', marker='x', s=50, label='Noise')
                    ax.legend()

                ax.set_title(f'{cluster_method} - {dim_method}')
                ax.set_xlabel(f'{dim_method} Component 1')
                ax.set_ylabel(f'{dim_method} Component 2')

                plt.colorbar(scatter, ax=ax)

        plt.tight_layout()
        plt.show()

    def plot_cluster_comparison(self):
        """Plot all clusterings side by side on a single 2-D embedding.

        The UMAP embedding is used when available, otherwise PCA.
        """
        # Look the fallback up lazily so a missing PCA entry is only an
        # error when it is actually needed.
        viz_name = 'UMAP' if 'UMAP' in self.dim_reduction_results else 'PCA'
        X_viz = self.dim_reduction_results[viz_name]

        n_algorithms = len(self.clustering_results)
        n_cols = 3
        n_rows = (n_algorithms + n_cols - 1) // n_cols

        fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)
        axes = axes.ravel()

        for idx, (name, labels) in enumerate(self.clustering_results.items()):
            ax = axes[idx]

            scatter = ax.scatter(X_viz[:, 0], X_viz[:, 1], c=labels,
                                 cmap='viridis', s=50, alpha=0.6)

            if -1 in labels:
                noise_mask = labels == -1
                ax.scatter(X_viz[noise_mask, 0], X_viz[noise_mask, 1],
                           c='red', marker='x', s=50, label='Noise')
                ax.legend()

            ax.set_title(f'{name}')
            ax.set_xlabel(f'{viz_name} 1')
            ax.set_ylabel(f'{viz_name} 2')
            plt.colorbar(scatter, ax=ax)

        # Hide empty subplots
        for idx in range(n_algorithms, len(axes)):
            axes[idx].set_visible(False)

        plt.tight_layout()
        plt.show()


In [None]:
# Continue with the ensemble-selected feature columns only.
data = selected_data

# All columns of `data` are used because no column subset is passed.
analyzer = AdvancedClusteringAnalysis(data)

# 1. 2-D embeddings (PCA, t-SNE, UMAP) used purely for plotting.
analyzer.perform_dimensionality_reduction()

# 2. Fit every algorithm; k and the DBSCAN parameters are searched
#    automatically since none are supplied.
analyzer.perform_clustering()

# 3. Grid of scatter plots: one row per algorithm, one column per embedding.
analyzer.visualize_all_results()

# 4. Side-by-side comparison of the algorithms on a single embedding.
analyzer.plot_cluster_comparison()
