In [81]:
import pandas as pd
from kmodes.kprototypes import KPrototypes
from sklearn.metrics import silhouette_score

In [82]:
# Read the source CSV (path is relative to the notebook's working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer="cvd_adj.csv")

In [83]:
# Numerical features are listed by hand, one per line.
numerical_columns = [
    'Height_(cm)',
    'Weight_(kg)',
    'BMI',
    'Alcohol_Consumption',
    'Fruit_Consumption',
    'Green_Vegetables_Consumption',
    'FriedPotato_Consumption',
]

# Categorical features are every boolean-typed column found in the loaded frame.
bool_frame = data.select_dtypes(include=['bool'])
categorical_columns = list(bool_frame.columns)

In [84]:
# Combine numerical and categorical columns and convert the boolean columns
# to integers in a single expression.
#
# `.astype` with a {column: dtype} mapping returns a new DataFrame, so nothing
# is assigned into a frame that was derived from `data` by column selection.
# That select-then-assign pattern is what triggers pandas'
# SettingWithCopyWarning; building the frame in one step avoids it and keeps
# this cell idempotent on re-run.
data_combined = data[numerical_columns + categorical_columns].astype(
    {col: int for col in categorical_columns}
)

In [85]:
# Sample 1% of the data (frac=0.01); random_state=42 fixes the seed so the same rows are drawn on re-run.
# NOTE(review): if a 10% sample was intended, frac should be 0.1 — confirm before changing,
# since the scores reported below were produced with frac=0.01.
sampled_data = data_combined.sample(frac=0.01, random_state=42)

In [86]:
# Set-up for the K-Prototypes sweep:
#   cluster_range      - candidate cluster counts to try (2 through 10 inclusive)
#   silhouette_scores  - collects one silhouette score per candidate, in the same order
cluster_range = range(2, 11)
silhouette_scores = list()

In [87]:
# Positional indices of the categorical columns, as expected by the
# `categorical=` argument of fit_predict. They do not change between
# iterations, so they are computed once instead of inside the loop.
categorical_idx = [sampled_data.columns.get_loc(col) for col in categorical_columns]

# Start from an empty list in this cell so that re-running it does not append a
# second set of scores, which would make the list longer than cluster_range
# and break the summary table built from both.
silhouette_scores = []

for n_clusters in cluster_range:
    # Fit K-Prototypes for this cluster count; random_state=42 keeps runs repeatable.
    kproto = KPrototypes(n_clusters=n_clusters, random_state=42)
    clusters = kproto.fit_predict(sampled_data, categorical=categorical_idx)

    # Mean silhouette score of the resulting labels.
    # NOTE(review): this uses plain Euclidean distance over the unscaled numeric
    # columns together with the 0/1-encoded categorical columns, which is not
    # the mixed dissimilarity K-Prototypes optimises. Consider a precomputed
    # mixed-type distance (metric='precomputed') — left unchanged here so the
    # reported scores stay comparable.
    silhouette_avg = silhouette_score(sampled_data, clusters, metric='euclidean')
    silhouette_scores.append(silhouette_avg)

In [88]:
# Tabulate the silhouette score obtained for each cluster count, then print the table
silhouette_table = pd.DataFrame({
    'Number of Clusters': list(cluster_range),
    'Silhouette Score': silhouette_scores,
})
print(silhouette_table)

   Number of Clusters  Silhouette Score
0                   2          0.366367
1                   3          0.266975
2                   4          0.278665
3                   5          0.225604
4                   6          0.216377
5                   7          0.194014
6                   8          0.204053
7                   9          0.193069
8                  10          0.193362
