In [21]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [9]:
def dunn_index(X, labels):
    """Compute the Dunn index of a clustering.

    The Dunn index is the smallest pairwise distance between points that
    belong to different clusters, divided by the largest pairwise distance
    between points that belong to the same cluster (the largest cluster
    diameter). Distances are whatever ``pdist`` computes by default.
    Larger values indicate compact, well-separated clusters.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Observations, one per row.
    labels : array-like of shape (n_samples,)
        Cluster label of each observation. Lists are accepted as well as
        arrays.

    Returns
    -------
    float
        The Dunn index. ``inf`` when every cluster has zero diameter but the
        clusters are separated; ``nan`` when both the separation and the
        largest diameter are zero.

    Raises
    ------
    ValueError
        If ``labels`` does not have one entry per row of ``X``, or if fewer
        than two distinct clusters are present.
    """
    distances = squareform(pdist(X))

    # Coerce to an array: with a plain list, ``labels == value`` is a single
    # bool rather than an element-wise mask, which selects no points at all.
    labels = np.asarray(labels).ravel()
    if labels.shape[0] != distances.shape[0]:
        raise ValueError(
            f"labels has {labels.shape[0]} entries but X has "
            f"{distances.shape[0]} samples"
        )

    # Row indices of each cluster, computed once instead of once per pair.
    members = [np.flatnonzero(labels == label) for label in np.unique(labels)]
    if len(members) < 2:
        raise ValueError(
            f"dunn_index requires at least 2 clusters, got {len(members)}"
        )

    # Smallest distance between any two points lying in different clusters.
    min_inter_cluster_distance = min(
        distances[np.ix_(first, second)].min()
        for position, first in enumerate(members)
        for second in members[position + 1:]
    )

    # Largest diameter over all clusters. A singleton cluster contributes its
    # self-distance of 0, so it never raises the maximum but also never leaves
    # the sequence empty.
    max_intra_cluster_distance = max(
        distances[np.ix_(idx, idx)].max() for idx in members
    )

    if max_intra_cluster_distance == 0:
        # Avoid a divide-by-zero warning; report the limiting value explicitly.
        if min_inter_cluster_distance == 0:
            return np.float64(np.nan)
        return np.float64(np.inf)

    return min_inter_cluster_distance / max_intra_cluster_distance

In [10]:
def evaluate_clustering(X, labels):
    """Score a clustering of ``X`` with three internal validity metrics.

    Parameters
    ----------
    X : array-like
        Observations passed unchanged to each scorer.
    labels : array-like
        Cluster assignment of each observation.

    Returns
    -------
    dict
        Maps the display name of each metric ('Silhouette Score',
        'Calinski-Harabasz Index', 'Dunn Index') to its value, in that order.
    """
    # (display name, scorer) pairs; evaluated in this order.
    scorers = (
        ('Silhouette Score', silhouette_score),
        ('Calinski-Harabasz Index', calinski_harabasz_score),
        ('Dunn Index', dunn_index),
    )
    return {name: scorer(X, labels) for name, scorer in scorers}

In [4]:
# Generate synthetic data
X, labels_true = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)
# X: shape (300, 2)
# labels_true: shape (300,)

# Apply KMeans clustering
# n_init is pinned explicitly: relying on the implicit default triggers a
# warning from KMeans' parameter check (default_n_init=10), so passing 10
# keeps the recorded behaviour and makes the run independent of the
# library's default.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)
labels_pred = kmeans.fit_predict(X)
# labels_pred: shape (300,)

# Evaluate the clustering
metrics = evaluate_clustering(X, labels_pred)
for metric, value in metrics.items():
    print(f'{metric}: {value:.4f}')

  super()._check_params_vs_input(X, default_n_init=10)


Silhouette Score: 0.6820
Calinski-Harabasz Index: 1210.0899
Dunn Index: 0.2023


In [5]:
# Sanity check: dimensions of the synthetic feature matrix.
X.shape

(300, 2)

In [6]:
# Sanity check: one predicted label per sample.
labels_pred.shape

(300,)

## CLUSTERING POST-PROCESSING

In [17]:
# Load the run-1 results for case "2" and the matching outliers table.
# NOTE(review): both files come back with an "Unnamed: 0" column, which looks
# like a saved index, and a "2" column holding stringified values -- confirm
# whether these should be parsed before further processing.
df_2 = pd.read_csv(filepath_or_buffer="run1/2.csv")
outliers_2 = pd.read_csv(filepath_or_buffer="run1/outliers/2.csv")

In [14]:
# Preview the first five rows of the results frame.
df_2.head(n=5)

Unnamed: 0,2
0,[9.95109321e-03 1.06348378e-02 1.36709653e+04]
1,[1.03788855e-02 1.27749121e-02 1.27341877e+04]
2,[1.04487033e-02 1.49994626e-02 1.58932338e+04]
3,[1.06573846e-02 1.31959059e-02 1.51513243e+04]
4,[1.09326054e-02 1.43242784e-02 1.57785814e+04]


In [18]:
# Preview the first five rows of the outliers frame.
outliers_2.head(n=5)

Unnamed: 0,2
0,"{'r0': 0.3989002260753505, 'rc': 0.001, 'c': 1..."
1,"{'r0': 0.9021197851236678, 'rc': 0.001, 'c': 1..."
2,"{'r0': 0.001, 'rc': 0.001, 'c': 10.0}"
3,"{'r0': 0.001, 'rc': 0.001, 'c': 10.0}"
4,"{'r0': 0.001, 'rc': 0.001, 'c': 10.0}"


In [None]:
# Display the results frame read from run1/2.csv (cell not yet executed).
df_2 