In [5]:
%load_ext autoreload
%autoreload 2
%cd code

c:\Users\xavid\Documents\GitHub\Clustering-Algorithms\code


In [6]:
import pandas as pd
import numpy as np

In [7]:
def summarize_centroid_differences(kmeans, column_names, top_n=5):
    """
    Summarizes centroid differences to identify key defining features.
    
    Parameters:
    - kmeans: Fitted CustomKMeans object.
    - top_n: Number of most significant features to highlight.
    
    Returns:
    - Summary DataFrame of top defining features and their differences.
    """
    centroids = kmeans.centroids
    n_clusters, n_features = centroids.shape
    
    # Normalize centroids for comparability
    normalized_centroids = (centroids - centroids.min(axis=0)) / (centroids.max(axis=0) - centroids.min(axis=0) + 1e-10)
    
    # Calculate variance or range of each feature across clusters
    feature_importance = np.ptp(normalized_centroids, axis=0)  # Range of each feature
    feature_ranking = np.argsort(-feature_importance)  # Sort descending by importance
    
    # Select top N most significant features
    top_features = feature_ranking[:top_n]
    top_feature_names = [column_names[idx] for idx in top_features]
    
    # Prepare summary
    summary = {
        "Feature": top_feature_names,  # Use indices from top_features
        "Range Across Clusters": feature_importance[top_features]
    }
    
    # Add centroid values for each cluster
    for cluster_idx in range(n_clusters):
        summary[f"Cluster {cluster_idx+1}"] = centroids[cluster_idx, top_features]

    summary_df = pd.DataFrame(summary)
    return summary_df


In [8]:
from code.kmeans import CustomKMeans

datasets = ['grid', 'vowel', 'sick']

for dataset in datasets:
    file = f'../datasets_processed/{dataset}.csv'
    df = pd.read_csv(file)
    X = df.iloc[:,:-1]
    y = df.iloc[:,-1]
    X = np.array(X)
    y = np.array(y)
    column_names = df.columns[:-1]
    random_indices = np.random.choice(len(X), 3, replace=False)
    centroids = X[random_indices]
    kmeans = CustomKMeans(n_clusters=3, init=centroids, distance='euclidean', max_iters=100, tolerance=1e-4)
    
    kmeans.fit(X)
    diff_df = summarize_centroid_differences(kmeans, column_names, top_n=5)
    print(dataset)
    display(diff_df)



grid


Unnamed: 0,Feature,Range Across Clusters,Cluster 1,Cluster 2,Cluster 3
0,y,1.0,0.697027,0.553301,0.236878
1,x,1.0,0.322027,0.769616,0.419913


vowel


Unnamed: 0,Feature,Range Across Clusters,Cluster 1,Cluster 2,Cluster 3
0,Sex,1.0,1.0,0.0,1.0
1,Speaker_Number_b'Rich',1.0,0.0,0.0,0.25
2,Speaker_Number_b'Nick',1.0,0.0,0.0,0.25
3,Speaker_Number_b'Mike',1.0,0.25,0.0,0.0
4,Speaker_Number_b'Mark',1.0,0.25,0.0,0.0


sick


Unnamed: 0,Feature,Range Across Clusters,Cluster 1,Cluster 2,Cluster 3
0,referral_source_b'SVI',1.0,0.0,0.0,1.0
1,referral_source_b'other',1.0,0.998638,0.0,0.0
2,referral_source_b'SVHC',1.0,0.0,0.722846,0.0
3,psych,1.0,0.006809,0.297753,0.009681
4,T3_measured,1.0,0.683613,0.955056,0.953533
