### Basic Configuration

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import silhouette_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import calinski_harabasz_score
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer

In [3]:
# Load the toy dataset
bc = load_breast_cancer()

# Create a DataFrame
bc_df = pd.DataFrame(data=bc.data, columns=bc.feature_names)

# Add the target variable (class) to the DataFrame
bc_df['target'] = bc.target

# Display the DataFrame
print(bc_df.head())

   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst texture  worst perimeter  worst area  \
0             

### Models

#### KMeans

In [4]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=42, n_init="auto").fit(bc_df)
kmeans_labels = kmeans.labels_
true_labels = bc.target

# Evaluation Methods
silhouette_avg1 = silhouette_score(bc_df, kmeans_labels)
dbi_score1 = davies_bouldin_score(bc_df, kmeans_labels)
rand_score1 = adjusted_rand_score(true_labels, kmeans_labels)
ch_score1 = calinski_harabasz_score(bc_df, kmeans_labels)

print("KMeans")
print("Silhouette Score:", silhouette_avg1)
print("Davies-Bouldin Index:", dbi_score1)
print("Rand Score:", rand_score1)
print("Calinski and Harabasz Score:", ch_score1)

KMeans
Silhouette Score: 0.6467638371962151
Davies-Bouldin Index: 0.6329050458971559
Rand Score: 0.5390726905036369
Calinski and Harabasz Score: 1246.2597348622987


#### Affinity Propagation

In [9]:
from sklearn import metrics
from sklearn.cluster import AffinityPropagation

af = AffinityPropagation(preference=-50, random_state=0).fit(bc_df)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
labels_true = bc.target

n_clusters_ = len(cluster_centers_indices)

silhouette_avg1 = silhouette_score(bc_df, labels)
dbi_score1 = davies_bouldin_score(bc_df, labels)
rand_score1 = adjusted_rand_score(labels_true, labels)
ch_score1 = calinski_harabasz_score(bc_df, labels)

print("Affinity Propagation")
print("Silhouette Score:", silhouette_avg1)
print("Davies-Bouldin Index:", dbi_score1)
print("Rand Score:", rand_score1)
print("Calinski and Harabasz Score:", ch_score1)

Affinity Propagation
Silhouette Score: 0.029075299464434805
Davies-Bouldin Index: 0.09339040693413933
Rand Score: 0.00023987188987162304
Calinski and Harabasz Score: 15577.954135952754


#### DBScan

In [13]:
from sklearn.cluster import DBSCAN
from sklearn.discriminant_analysis import StandardScaler

x = StandardScaler().fit_transform(bc_df)
db = DBSCAN(eps=0.3, min_samples=10).fit(x)
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

Estimated number of clusters: 0
Estimated number of noise points: 569


#### MeanShift

In [14]:
from sklearn.cluster import MeanShift, estimate_bandwidth
x = StandardScaler().fit_transform(bc_df)
bandwidth = estimate_bandwidth(x, quantile=0.2, n_samples=500)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(x)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print("number of estimated clusters : %d" % n_clusters_)

number of estimated clusters : 1
