In [None]:
# Install the third-party dependencies used by this notebook.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel,
# and quoting the extra keeps the shell from treating `[full]` as a glob pattern.
# NOTE(review): versions are unpinned; pin them once known-good versions are confirmed.
%pip install ucimlrepo
%pip install "pycaret[full]"

In [None]:
import pandas as pd
from ucimlrepo import fetch_ucirepo
from pycaret.clustering import *
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

## Heart Disease Dataset
Additional Info: This database contains 76 attributes, but all published experiments refer to using a subset of 14 of them. In particular, the Cleveland database is the only one that has been used by ML researchers to date. The "goal" field refers to the presence of heart disease in the patient; it is integer-valued, from 0 (no presence) to 4. Experiments with the Cleveland database have concentrated on simply attempting to distinguish presence (values 1, 2, 3, 4) from absence (value 0).

In [None]:
from ucimlrepo import fetch_ucirepo

# Download the Heart Disease dataset (UCI repository id 45).
heart_disease = fetch_ucirepo(id=45)

# Split the download into its two pandas dataframes.
X = heart_disease.data.features  # feature columns
Y = heart_disease.data.targets  # goal (target variable)

# Preview the first rows of the features, then of the target.
for frame in (X, Y):
  print(frame.head())

In [None]:
# Shared results table: `evaluate` adds one column per (pipeline, cluster count) pair.
result_df = pd.DataFrame()

# Preprocessing pipelines to compare, in output-column order.
# Each entry is (column-name prefix, extra keyword arguments passed to `setup`).
_PIPELINES = (
  ("A", {}),  # No pre-processing
  ("B", {"normalize": True, "normalize_method": "zscore"}),  # Normalization
  ("C", {"transformation": True, "transformation_method": "yeo-johnson"}),  # Transformation
  ("D", {"pca": True, "pca_method": "linear"}),  # PCA
  ("E", {"transformation": True, "normalize": True,
         "normalize_method": "zscore", "transformation_method": "yeo-johnson"}),  # Transformation + Normalization
  ("F", {"transformation": True, "normalize": True,
         "normalize_method": "zscore", "transformation_method": "yeo-johnson",
         "pca": True, "pca_method": "linear"}),  # Transformation + Normalization + PCA
)


def _score_model(model, num_clusters):
  """Fit one clustering model and return [silhouette, Calinski-Harabasz, Davies-Bouldin]."""
  # Rows with missing values are dropped before scoring.
  # NOTE(review): the scores are computed on whatever frame `assign_model` returns;
  # confirm that this is the feature space the metrics are meant to be measured in.
  assigned = assign_model(create_model(model, num_clusters = num_clusters)).dropna()
  features = assigned.drop(['Cluster'], axis=1)
  labels = assigned['Cluster']
  return [
    silhouette_score(features, labels),
    calinski_harabasz_score(features, labels),
    davies_bouldin_score(features, labels),
  ]


def evaluate(data , models , clusters , id = 123):
  """Score every model for every cluster count under each preprocessing pipeline.

  For each pipeline in `_PIPELINES` and each value in `clusters`, the experiment
  is configured with `setup`, every model in `models` is fitted, and its three
  scores are collected. The scores are written into the module-level
  `result_df` as a column named "<prefix><cluster count>" (e.g. "A3", "F5").
  Within a column, rows follow the order of `models`, three rows per model:
  silhouette, Calinski-Harabasz, Davies-Bouldin.

  Parameters:
    data: dataset handed to `setup` as `data`.
    models: iterable of model identifiers accepted by `create_model`.
    clusters: iterable of cluster counts passed as `num_clusters`.
    id: value passed to `setup` as `session_id` (default 123).

  Returns:
    None. Results are stored in `result_df`, which is modified in place.

  Note:
    Every column holds 3 * len(models) values. Calling this again with a
    different number of models while `result_df` is already populated is
    expected to fail on the length mismatch; recreate `result_df` first.
  """
  for prefix, options in _PIPELINES:
    for c in clusters:
      # `setup` is re-run for every cluster count, exactly as the experiment was
      # originally written, so each column starts from a freshly configured session.
      setup(data = data, session_id = id, verbose = False, **options)
      scores = []
      for model in models:
        scores.extend(_score_model(model, c))
      result_df[f"{prefix}{c}"] = scores


In [None]:
# Clustering algorithms to compare; each name is passed to `create_model`.
models = [
  "kmeans",
  "hclust",
  "birch",
  "optics",
]

# Numbers of clusters to try for each algorithm.
clusters = [3, 4, 5]

In [None]:
# Only the feature frame X is passed in: clustering is mostly carried out on the
# features, not on the target variable.
evaluate(data = X, models = models, clusters = clusters)


In [None]:
# Each column is one (pipeline prefix, cluster count) pair. Rows follow the order of
# `models`, three per model: silhouette, Calinski-Harabasz, Davies-Bouldin.
# NOTE(review): the saved output below has 10 rows, whereas 4 models x 3 metrics
# would give 12; the output looks stale, so re-run the notebook to confirm.
result_df

Unnamed: 0,A3,A4,A5,B3,B4,B5,C3,C4,C5,D3,D4,D5,E3,E4,E5,F3,F4,F5
0,0.282925,0.280676,0.278541,0.045167,-0.009402,-0.020177,0.073549,0.043958,0.027326,0.282925,0.280676,0.278541,0.040871,0.005899,-0.057213,0.040871,0.005899,-0.057213
1,191.34589,173.751377,166.741409,22.960882,8.400353,10.91079,23.502512,17.204648,13.289174,191.34589,173.751377,166.741409,23.006961,13.417535,10.452849,23.006961,13.417535,10.452849
2,1.15152,1.063706,1.054508,3.337925,5.958973,8.243679,2.968424,3.839391,4.737603,1.15152,1.063706,1.054508,3.390921,4.938975,5.356865,3.390921,4.938975,5.356865
3,0.24222,0.25179,0.199588,0.027073,-0.0171,-0.024598,0.072676,0.040667,0.035639,0.24222,0.25179,0.199588,0.004718,-0.032182,-0.036049,0.004718,-0.032182,-0.036049
4,169.90264,151.224882,142.684686,10.94911,8.381465,7.861121,23.801669,17.355506,13.850153,169.90264,151.224882,142.684686,6.039781,6.669668,6.72192,6.039781,6.669668,6.72192
5,1.278892,1.148266,1.30012,5.309832,6.202113,5.629623,2.834257,4.215775,4.600962,1.278892,1.148266,1.30012,5.940081,7.28769,7.730522,5.940081,7.28769,7.730522
6,0.24222,0.25179,0.199588,0.014566,-0.034091,-0.048895,0.072676,0.040667,0.035639,0.24222,0.25179,0.199588,0.013285,-0.020089,-0.026549,0.013285,-0.020089,-0.026549
7,169.90264,151.224882,142.684686,8.582735,6.870633,6.959544,23.801669,17.355506,13.850153,169.90264,151.224882,142.684686,7.673619,6.67292,6.227608,7.673619,6.67292,6.227608
8,1.278892,1.148266,1.30012,5.575086,5.43595,4.953021,2.834257,4.215775,4.600962,1.278892,1.148266,1.30012,5.810771,7.287971,6.702888,5.810771,7.287971,6.702888
9,-0.350933,-0.350933,-0.350933,-0.289556,-0.289556,-0.289556,-0.337426,-0.337426,-0.337426,-0.350933,-0.350933,-0.350933,-0.328201,-0.328201,-0.328201,-0.328201,-0.328201,-0.328201


In [None]:
# Write the score table to "result.csv" without the row index.
result_df.to_csv(path_or_buf = "result.csv", index = False)

## Visualization