# Clustering summaries

In [45]:
import pandas as pd
from collections import defaultdict
import numpy as np

In [46]:
df = pd.read_csv('data/ilpd-clustered.csv')

In [47]:
clustering_columns = [col for col in df.columns if 'cluster' in col.lower()]

numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
feature_columns = [col for col in numerical_columns if col not in clustering_columns]

In [48]:
# Calculate gender proportions for each clustering method
gender_stats = {}

for cluster_col in clustering_columns:
    unique_clusters = sorted(df[cluster_col].unique())
    
    gender_stats[cluster_col] = {}
    
    for cluster in unique_clusters:
        cluster_data = df[df[cluster_col] == cluster]
        gender_counts = cluster_data['Gender'].value_counts()
        total_count = len(cluster_data)
        
        gender_proportions = gender_counts / total_count
        
        gender_stats[cluster_col][f'cluster_{cluster}'] = {
            'total_count': total_count,
            'gender_counts': gender_counts.to_dict(),
            'gender_proportions': gender_proportions.to_dict()
        }

In [49]:
# Create comprehensive summary DataFrames including gender
comprehensive_summary = []

for cluster_method in clustering_columns:
    unique_clusters = sorted(df[cluster_method].unique())
    
    for cluster in unique_clusters:
        cluster_id = str(cluster)
        
        cluster_data = df[df[cluster_method] == cluster][feature_columns]
        means = cluster_data.mean()
        medians = cluster_data.median()
        stds = cluster_data.std()
        
        gender_info = gender_stats[cluster_method][f'cluster_{cluster}']
        
        base_row = {
            'Clustering_Method': cluster_method,
            'Cluster': cluster_id,
            'Sample_Count': gender_info['total_count']
        }
        
        for feature in feature_columns:
            base_row[f'{feature}_Mean'] = means[feature]
            base_row[f'{feature}_Median'] = medians[feature]
            base_row[f'{feature}_Std'] = stds[feature]
        
        for gender, proportion in gender_info['gender_proportions'].items():
            base_row[f'Gender_{gender}_Proportion'] = proportion
            base_row[f'Gender_{gender}_Count'] = gender_info['gender_counts'][gender]
        
        comprehensive_summary.append(base_row)

comprehensive_df = pd.DataFrame(comprehensive_summary)

In [55]:
comprehensive_df

Unnamed: 0,Clustering_Method,Cluster,Sample_Count,Age_Mean,Age_Median,Age_Std,TB_Mean,TB_Median,TB_Std,DB_Mean,...,A/G Ratio_Mean,A/G Ratio_Median,A/G Ratio_Std,Label_Mean,Label_Median,Label_Std,Gender_Male_Proportion,Gender_Male_Count,Gender_Female_Proportion,Gender_Female_Count
0,Clusters_KProto_k2,0,568,44.973592,45.0,16.220298,3.189613,1.0,6.128735,1.436092,...,0.946391,0.925,0.319795,1.290493,1.0,0.45439,0.755282,429.0,0.244718,139.0
1,Clusters_KProto_k2,1,11,34.909091,34.0,13.523717,9.809091,6.2,8.045304,4.490909,...,0.981818,1.1,0.321926,1.0,1.0,0.0,0.909091,10.0,0.090909,1.0
2,Clusters_KProto_k4,0,43,45.930233,46.0,13.286935,20.937209,18.4,10.873056,9.797674,...,0.797442,0.7,0.544588,1.0,1.0,0.0,0.837209,36.0,0.162791,7.0
3,Clusters_KProto_k4,1,519,44.880539,45.0,16.449795,1.696339,0.9,1.73556,0.731599,...,0.96183,1.0,0.290406,1.317919,1.0,0.466117,0.747592,388.0,0.252408,131.0
4,Clusters_KProto_k4,2,2,49.0,49.0,24.041631,12.0,12.0,0.989949,5.9,...,0.9,0.9,0.565685,1.0,1.0,0.0,1.0,2.0,,
5,Clusters_KProto_k4,3,15,37.533333,34.0,14.764178,7.66,5.7,7.287837,3.486667,...,0.871333,0.9,0.31715,1.0,1.0,0.0,0.866667,13.0,0.133333,2.0
6,Clusters_KProto_k5,0,494,45.058704,45.0,16.539172,1.619838,0.9,1.669877,0.676721,...,0.959555,1.0,0.293385,1.331984,1.0,0.471403,0.746964,369.0,0.253036,125.0
7,Clusters_KProto_k5,1,2,49.0,49.0,24.041631,12.0,12.0,0.989949,5.9,...,0.9,0.9,0.565685,1.0,1.0,0.0,1.0,2.0,,
8,Clusters_KProto_k5,2,33,43.545455,44.0,15.270664,4.163636,3.7,3.399524,2.20303,...,0.916364,1.0,0.276392,1.030303,1.0,0.174078,0.787879,26.0,0.212121,7.0
9,Clusters_KProto_k5,3,41,45.097561,45.0,13.024601,21.319512,18.5,10.986042,9.968293,...,0.811951,0.7,0.553413,1.0,1.0,0.0,0.829268,34.0,0.170732,7.0
