# Clustering summaries

In [15]:
import pandas as pd

In [16]:
# Read the CSV that every later cell summarises; the path is relative to the
# directory the notebook is run from.
df = pd.read_csv(filepath_or_buffer='data/ilpd-clustered.csv')

In [17]:
# Columns holding cluster assignments are recognised purely by name:
# any column whose lower-cased name contains 'cluster'.
clustering_columns = list(filter(lambda name: 'cluster' in name.lower(), df.columns))

# Features are the int64/float64 columns that are not cluster-assignment columns.
numerical_columns = list(df.select_dtypes(include=['int64', 'float64']).columns)
feature_columns = [name for name in numerical_columns if name not in clustering_columns]

In [None]:
import numpy as np
from collections import defaultdict

# Per-cluster descriptive statistics for every clustering method.
# Layout: clustering_stats[method]['cluster_<label>'] -> {'mean', 'median', 'std'},
# where each value is a pandas Series indexed by feature column.
clustering_stats = {}

# Calculate statistics for each clustering method
for cluster_col in clustering_columns:
    clustering_stats[cluster_col] = {}

    # One groupby pass partitions the rows once per method instead of
    # re-scanning the whole frame with a boolean mask for every cluster.
    # Groups come back in sorted label order, and rows whose label is NaN are
    # left out rather than producing an empty, all-NaN 'cluster_nan' entry.
    for cluster, cluster_rows in df.groupby(cluster_col):
        # Select the features inside the loop so the result does not depend on
        # how groupby iteration treats column selection.
        cluster_data = cluster_rows[feature_columns]

        stats = {
            'mean': cluster_data.mean(),
            'median': cluster_data.median(),
            'std': cluster_data.std()  # pandas default: sample std (ddof=1)
        }

        clustering_stats[cluster_col][f'cluster_{cluster}'] = stats

In [None]:
# Create organized summary DataFrames
# summary_dfs maps each clustering method to a long-format frame with one row
# per (cluster, feature); all_summaries stacks those frames for all methods.
summary_dfs = {}

for cluster_method in clustering_columns:
    summary_data = []

    # Cluster sizes are computed once per method and keyed by the label's
    # string form, which is how the label is embedded in the 'cluster_<label>'
    # keys of clustering_stats. This avoids re-filtering the whole frame for
    # every feature row and does not require labels to be parseable as int.
    cluster_sizes = {
        str(label): int(size)
        for label, size in df[cluster_method].value_counts().items()
    }

    for cluster_name, stats in clustering_stats[cluster_method].items():
        # Strip only the leading 'cluster_' prefix so labels that themselves
        # contain underscores are kept intact.
        cluster_id = cluster_name.split('_', 1)[1]
        sample_count = cluster_sizes.get(cluster_id, 0)

        for feature in feature_columns:
            summary_data.append({
                'Clustering_Method': cluster_method,
                'Cluster': cluster_id,
                'Feature': feature,
                'Mean': stats['mean'][feature],
                'Median': stats['median'][feature],
                'Std_Dev': stats['std'][feature],
                'Sample_Count': sample_count
            })

    summary_df = pd.DataFrame(summary_data)
    summary_dfs[cluster_method] = summary_df

all_summaries = pd.concat(summary_dfs.values(), ignore_index=True)

In [20]:
# Show the combined long-format summary as the cell's output: one row per
# clustering method, cluster and feature.
all_summaries

Unnamed: 0,Clustering_Method,Cluster,Feature,Mean,Median,Std_Dev,Sample_Count
0,Clusters_KProto_k2,0,Age,44.973592,45.0,16.220298,568
1,Clusters_KProto_k2,0,TB,3.189613,1.0,6.128735,568
2,Clusters_KProto_k2,0,DB,1.436092,0.3,2.772506,568
3,Clusters_KProto_k2,0,Alkphos,288.406690,208.0,238.708965,568
4,Clusters_KProto_k2,0,Sgpt,59.054577,35.0,74.341759,568
...,...,...,...,...,...,...,...
425,Clusters_Spectral_k6,5,Sgot,81.120000,59.0,68.413086,75
426,Clusters_Spectral_k6,5,TP,5.842667,5.8,1.144691,75
427,Clusters_Spectral_k6,5,ALB,2.238667,2.1,0.529378,75
428,Clusters_Spectral_k6,5,A/G Ratio,0.593733,0.6,0.134444,75
