In [4]:
%load_ext autoreload
%autoreload 2
%cd code

c:\Users\xavid\Documents\GitHub\Clustering-Algorithms\code


In [25]:
import pandas as pd
import numpy as np

In [31]:
def summarize_centroid_differences(kmeans, column_names, top_n=5):
    """
    Summarizes centroid differences to identify key defining features.
    
    Parameters:
    - kmeans: Fitted CustomKMeans object.
    - top_n: Number of most significant features to highlight.
    
    Returns:
    - Summary DataFrame of top defining features and their differences.
    """
    centroids = kmeans.centroids
    n_clusters, n_features = centroids.shape
    
    # Normalize centroids for comparability
    normalized_centroids = (centroids - centroids.min(axis=0)) / (centroids.max(axis=0) - centroids.min(axis=0) + 1e-10)
    
    # Calculate variance or range of each feature across clusters
    feature_importance = np.ptp(normalized_centroids, axis=0)  # Range of each feature
    feature_ranking = np.argsort(-feature_importance)  # Sort descending by importance
    
    # Select top N most significant features
    top_features = feature_ranking[:top_n]
    top_feature_names = [column_names[idx] for idx in top_features]
    
    # Prepare summary
    summary = {
        "Feature": top_feature_names,  # Use indices from top_features
        "Range Across Clusters": feature_importance[top_features]
    }
    
    # Add centroid values for each cluster
    for cluster_idx in range(n_clusters):
        summary[f"Cluster {cluster_idx+1}"] = centroids[cluster_idx, top_features]

    summary_df = pd.DataFrame(summary)
    return summary_df


In [35]:
from code.kmeans import CustomKMeans

datasets = ['grid', 'vowel', 'sick']

for dataset in datasets:
    file = f'../datasets_processed/{dataset}.csv'
    df = pd.read_csv(file)
    X = df.iloc[:,:-1]
    y = df.iloc[:,-1]
    X = np.array(X)
    y = np.array(y)
    column_names = df.columns[:-1]
    random_indices = np.random.choice(len(X), 3, replace=False)
    centroids = X[random_indices]
    kmeans = CustomKMeans(n_clusters=3, init=centroids, distance='euclidean', max_iters=100, tolerance=1e-4)
    
    kmeans.fit(X)
    diff_df = summarize_centroid_differences(kmeans, column_names, top_n=5)
    print(dataset)
    display(diff_df)

grid


Unnamed: 0,Feature,Range Across Clusters,Cluster 1,Cluster 2,Cluster 3
0,y,1.0,0.553301,0.697027,0.236878
1,x,1.0,0.769616,0.322027,0.419913


vowel


Unnamed: 0,Feature,Range Across Clusters,Cluster 1,Cluster 2,Cluster 3
0,Sex,1.0,1.0,0.0,0.0
1,Speaker_Number_b'Sue',1.0,0.0,0.333333,0.0
2,Speaker_Number_b'Rose',1.0,0.0,0.333333,0.0
3,Speaker_Number_b'Wendy',1.0,0.0,0.333333,0.0
4,Speaker_Number_b'Jo',1.0,0.0,0.0,0.25


sick


Unnamed: 0,Feature,Range Across Clusters,Cluster 1,Cluster 2,Cluster 3
0,referral_source_b'other',1.0,0.998633,0.012678,0.0
1,sex_b'M',1.0,0.237813,0.982567,0.0
2,sex_b'F',1.0,0.720729,0.0,0.949153
3,referral_source_b'SVI',1.0,0.0,0.645008,0.663136
4,referral_source_b'SVHC',1.0,0.0,0.324881,0.191737


Unnamed: 0,x,y
0,0.500000,0.520833
1,0.500000,0.541667
2,0.500000,0.562500
3,0.500000,0.583333
4,0.500000,0.604167
...,...,...
1883,0.020833,0.437500
1884,0.020833,0.416667
1885,0.020833,0.395833
1886,0.020833,0.375000
