In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import silhouette_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import calinski_harabasz_score
import pandas as pd
import numpy as np

In [4]:
from sklearn.datasets import load_diabetes

# Fetch the scikit-learn diabetes dataset object
diabetes = load_diabetes()

# Reference values that the clustering cells pass to adjusted_rand_score
true_labels = diabetes.target

# Tabulate the features under their published names, then append the target
# as one extra column.
# NOTE(review): the target values printed below (151.0, 75.0, ...) look like a
# continuous measurement rather than a class id -- confirm before treating
# them as ground-truth cluster labels.
# NOTE(review): the clustering cells fit on this whole frame, so the 'target'
# column added here becomes part of the clustered feature space.
diabetes_df = pd.DataFrame(data=diabetes.data, columns=diabetes.feature_names)
diabetes_df['target'] = true_labels

# Preview the first rows
print(diabetes_df.head())

        age       sex       bmi        bp        s1        s2        s3  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

         s4        s5        s6  target  
0 -0.002592  0.019907 -0.017646   151.0  
1 -0.039493 -0.068332 -0.092204    75.0  
2 -0.002592  0.002861 -0.025930   141.0  
3  0.034309  0.022688 -0.009362   206.0  
4 -0.002592 -0.031988 -0.046641   135.0  


In [5]:
# K-Means with 3 clusters, scored with four clustering metrics
from sklearn.cluster import KMeans

# NOTE(review): diabetes_df still contains the 'target' column, so the target
# is part of the data being clustered and is also the reference passed to
# adjusted_rand_score below -- confirm this is intended.

# Configure the estimator (seeded for repeatability), then fit it
kmeans = KMeans(n_clusters=3, random_state=20, n_init="auto")
kmeans.fit(diabetes_df)

# Cluster id assigned to each row
kmeans_labels = kmeans.labels_

# Metrics computed from the data and the assigned clusters only
ch_score1 = calinski_harabasz_score(diabetes_df, kmeans_labels)
dbi_score1 = davies_bouldin_score(diabetes_df, kmeans_labels)
silhouette_avg1 = silhouette_score(diabetes_df, kmeans_labels)

# Agreement between the assigned clusters and the reference values
rand_score1 = adjusted_rand_score(true_labels, kmeans_labels)

# Report; the "Rand Score" line shows the *adjusted* Rand index
print("KMean")
print("Silhouette Score:", silhouette_avg1)
print("Davies-Bouldin Index:", dbi_score1)
print("Rand Score:", rand_score1)
print("Calinski and Harabasz Score:", ch_score1)

KMean
Silhouette Score: 0.6030231408737028
Davies-Bouldin Index: 0.4927745558025032
Rand Score: 0.014300943432801118
Calinski and Harabasz Score: 1719.5587209653474


In [9]:
# Mean shift, scored with the same four clustering metrics
from sklearn.cluster import MeanShift

# NOTE(review): diabetes_df still contains the 'target' column, so the target
# is part of the data being clustered and is also the reference passed to
# adjusted_rand_score below -- confirm this is intended.

# No arguments are passed, so the estimator runs with its library defaults
mean_shift = MeanShift()
mean_shift.fit(diabetes_df)

# Cluster id assigned to each row
mean_shift_labels = mean_shift.labels_

# Metrics computed from the data and the assigned clusters only
ch_score2 = calinski_harabasz_score(diabetes_df, mean_shift_labels)
dbi_score2 = davies_bouldin_score(diabetes_df, mean_shift_labels)
silhouette_avg2 = silhouette_score(diabetes_df, mean_shift_labels)

# Agreement between the assigned clusters and the reference values
rand_score2 = adjusted_rand_score(true_labels, mean_shift_labels)

# Report; the "Rand Score" line shows the *adjusted* Rand index
print("Mean shift")
print("Silhouette Score:", silhouette_avg2)
print("Davies-Bouldin Index:", dbi_score2)
print("Rand Score:", rand_score2)
print("Calinski and Harabasz Score:", ch_score2)

Mean shift
Silhouette Score: 0.5846141063011262
Davies-Bouldin Index: 0.5054796457754716
Rand Score: 0.01458502610779577
Calinski and Harabasz Score: 1607.3872409066034


In [7]:
# Agglomerative clustering with 3 clusters, scored with the same four metrics
from sklearn.cluster import AgglomerativeClustering

# NOTE(review): diabetes_df still contains the 'target' column, so the target
# is part of the data being clustered and is also the reference passed to
# adjusted_rand_score below -- confirm this is intended.

# Configure the estimator, then fit it
agglomerative_clustering = AgglomerativeClustering(n_clusters=3)
agglomerative_clustering.fit(diabetes_df)

# Cluster id assigned to each row
agglomerative_labels = agglomerative_clustering.labels_

# Metrics computed from the data and the assigned clusters only
ch_score3 = calinski_harabasz_score(diabetes_df, agglomerative_labels)
dbi_score3 = davies_bouldin_score(diabetes_df, agglomerative_labels)
silhouette_avg3 = silhouette_score(diabetes_df, agglomerative_labels)

# Agreement between the assigned clusters and the reference values
rand_score3 = adjusted_rand_score(true_labels, agglomerative_labels)

# Report; the "Rand Score" line shows the *adjusted* Rand index
print("Agglomerative clustering")
print("Silhouette Score:", silhouette_avg3)
print("Davies-Bouldin Index:", dbi_score3)
print("Rand Score:", rand_score3)
print("Calinski and Harabasz Score:", ch_score3)

Agglomerative clustering
Silhouette Score: 0.5439283189634871
Davies-Bouldin Index: 0.4787328114192006
Rand Score: 0.009953764601743948
Calinski and Harabasz Score: 1173.9037953940565


In [10]:
# Spectral clustering with 3 clusters, scored with the same four metrics
from sklearn.cluster import SpectralClustering

# NOTE(review): diabetes_df still contains the 'target' column, so the target
# is part of the data being clustered and is also the reference passed to
# adjusted_rand_score below -- confirm this is intended.

# Configure the estimator (seeded for repeatability), then fit it.
# NOTE(review): the lobpcg "not reaching the requested tolerance" warnings in
# the recorded output presumably come from this fit -- verify on a re-run.
spectral_clustering = SpectralClustering(n_clusters=3, random_state=20)
spectral_clustering.fit(diabetes_df)

# Cluster id assigned to each row
spectral_labels = spectral_clustering.labels_

# Metrics computed from the data and the assigned clusters only
ch_score4 = calinski_harabasz_score(diabetes_df, spectral_labels)
dbi_score4 = davies_bouldin_score(diabetes_df, spectral_labels)
silhouette_avg4 = silhouette_score(diabetes_df, spectral_labels)

# Agreement between the assigned clusters and the reference values
rand_score4 = adjusted_rand_score(true_labels, spectral_labels)

# Report; the "Rand Score" line shows the *adjusted* Rand index
print("Spectral clustering")
print("Silhouette Score:", silhouette_avg4)
print("Davies-Bouldin Index:", dbi_score4)
print("Rand Score:", rand_score4)
print("Calinski and Harabasz Score:", ch_score4)

Spectral clustering
Silhouette Score: 0.37002714205416143
Davies-Bouldin Index: 0.38052155617568
Rand Score: 0.00021143141355027764
Calinski and Harabasz Score: 17.904196239696834


[1.75441817e-13 2.93929128e-06 1.14962012e-05 4.80276754e-06]
not reaching the requested tolerance 6.586313247680664e-06.
Use iteration 662 instead with accuracy 
4.7059618646132595e-06.

  _, diffusion_map = lobpcg(
[7.67404325e-14 3.03374261e-06 4.78135198e-06 1.10087406e-05]
not reaching the requested tolerance 6.586313247680664e-06.
  _, diffusion_map = lobpcg(


Silhouette Score from best to worst
Kmeans > Mean shift > Agglomerative clustering > Spectral clustering

DBI from best to worst (lower is better)
Spectral clustering > Agglomerative clustering > Kmeans > Mean shift

Adjusted Rand score from best to worst (higher is better)
Mean shift > Kmeans > Agglomerative clustering > Spectral clustering

Calinski and Harabasz score from best to worst
Kmeans > Mean shift > Agglomerative clustering > Spectral clustering
