In [None]:
from cdsgd import DSClustering
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, adjusted_rand_score
from scipy.spatial.distance import cdist
from scipy.stats import pearsonr
from sklearn.datasets import load_iris,load_wine
from sklearn.preprocessing import StandardScaler

In [None]:
def dunn_index(X, labels):
    """Compute the Dunn index of a clustering using Euclidean distances.

    The index is the smallest distance between two points that belong to
    different clusters, divided by the largest cluster diameter (the largest
    pairwise distance between two points of the same cluster).

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Sample coordinates. Anything ``np.asarray`` accepts (ndarray,
        DataFrame, nested list) is supported.
    labels : array-like of shape (n_samples,)
        Cluster label of every sample. Every distinct value is treated as a
        cluster of its own, including a noise marker such as ``-1``.

    Returns
    -------
    float
        The Dunn index. ``inf`` is returned when every cluster has zero
        diameter but the clusters are separated, and ``nan`` when both the
        separation and the largest diameter are zero.

    Raises
    ------
    ValueError
        If ``labels`` contains fewer than two distinct clusters, in which case
        no inter-cluster distance exists.
    """
    # Normalise the inputs so boolean masking behaves the same for lists,
    # DataFrames and arrays.
    X = np.asarray(X)
    labels = np.asarray(labels).ravel()

    unique_labels = np.unique(labels)
    k = len(unique_labels)
    if k < 2:
        raise ValueError(
            "dunn_index requires at least 2 clusters, got %d" % k
        )

    # Slice every cluster once instead of once per cluster pair.
    clusters = [X[labels == label] for label in unique_labels]

    # Diameter of each cluster; a single-point cluster has diameter 0.
    max_diameter = max(
        np.max(cdist(points, points, metric='euclidean')) if len(points) > 1 else 0.0
        for points in clusters
    )

    # Smallest distance between points of two different clusters.
    min_intercluster_distance = min(
        np.min(cdist(clusters[i], clusters[j], metric='euclidean'))
        for i in range(k)
        for j in range(i + 1, k)
    )

    # Degenerate case: all clusters collapse to a point. Return the same
    # limit values a floating-point division would give, without the warning.
    if max_diameter == 0:
        return np.inf if min_intercluster_distance > 0 else np.nan

    return min_intercluster_distance / max_diameter

Experimento

In [None]:
# CSV-backed benchmark datasets: one file with the data and one with the labels.
atom = pd.read_csv('data/Atom_Data.csv')
atom_labels = pd.read_csv('data/Atom_Labels.csv')
chainlink = pd.read_csv('data/Chainlink_Data.csv')
chainlink_labels = pd.read_csv('data/Chainlink_Labels.csv')
engytime = pd.read_csv('data/EngyTime_Data.csv')
engytime_labels = pd.read_csv('data/EngyTime_Labels.csv')
hepta = pd.read_csv('data/Hepta_Data.csv')
hepta_labels = pd.read_csv('data/Hepta_Labels.csv')
tetra = pd.read_csv('data/Tetra_Data.csv')
tetra_labels = pd.read_csv('data/Tetra_Labels.csv')
target = pd.read_csv('data/Target_Data.csv')
target_labels = pd.read_csv('data/Target_Labels.csv')
two_diamonds = pd.read_csv('data/TwoDiamonds_Data.csv')
two_diamonds_labels = pd.read_csv('data/TwoDiamonds_Labels.csv')
wing_nut = pd.read_csv('data/WingNut_Data.csv')
wing_nut_labels = pd.read_csv('data/WingNut_Labels.csv')

# Classic datasets bundled with sklearn, wrapped in DataFrames like the CSV ones.
iris = load_iris()
iris_data = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_labels = pd.DataFrame(iris.target, columns=['target'])
wine = load_wine()
wine_data = pd.DataFrame(wine.data, columns=wine.feature_names)
wine_labels = pd.DataFrame(wine.target, columns=['target'])

# Registry consumed by the experiment loop. For the CSV datasets the number of
# clusters is the count of distinct values in the first label column; for the
# sklearn datasets it is fixed to 3.
datasets = [
    {
        'name': name,
        'data': frame,
        'labels': label_frame,
        'n_clusters': label_frame.nunique().values[0],
    }
    for name, frame, label_frame in (
        ('Atom', atom, atom_labels),
        ('Chainlink', chainlink, chainlink_labels),
        ('EngyTime', engytime, engytime_labels),
        ('Hepta', hepta, hepta_labels),
        ('Tetra', tetra, tetra_labels),
        ('Target', target, target_labels),
        ('TwoDiamonds', two_diamonds, two_diamonds_labels),
        ('WingNut', wing_nut, wing_nut_labels),
    )
] + [
    {
        'name': name,
        'data': frame,
        'labels': label_frame,
        'n_clusters': 3,
    }
    for name, frame, label_frame in (
        ('Iris', iris_data, iris_labels),
        ('Wine', wine_data, wine_labels),
    )
]

# Accumulator for the result records
results_df = []

In [None]:
# Benchmark: for every run and every dataset, fit each clustering algorithm,
# score its labelling and write one CSV of scores per run.
# TODO: test at least 10 cases and evaluate the differences with Ricardo.
# TODO: look up definitions of interpretability and how it is measured.
N_RUNS = 20        # number of repetitions of the whole benchmark
DBSCAN_EPS = 0.5   # DBSCAN neighbourhood radius, applied to the standardized data


def _score_clustering(data, true_labels, pred_labels):
    """Score one predicted labelling.

    Returns the tuple ``(silhouette, dunn, adjusted_rand, pearson)`` where
    ``pearson`` is the full ``pearsonr`` result (index 0 is the coefficient).
    When the prediction has a single cluster the internal indices are
    undefined, so the placeholder ``(0, 0, 0, [0, 0])`` is returned instead.
    The guard is applied to every algorithm, not only to some of them.

    NOTE(review): the Pearson coefficient is computed on raw cluster ids, so
    it depends on how each algorithm happens to number its clusters; the
    adjusted Rand score does not. Confirm this metric is intended.
    """
    if len(np.unique(pred_labels)) > 1:
        return (
            silhouette_score(data, pred_labels),
            dunn_index(data, pred_labels),
            adjusted_rand_score(true_labels, pred_labels),
            pearsonr(true_labels, pred_labels),
        )
    return 0, 0, 0, [0, 0]


def _fit_cdsgd(data, **kwargs):
    """Build a DSClustering model on a copy of ``data`` and return its labels.

    Extra keyword arguments are forwarded to the ``DSClustering`` constructor.
    """
    model = DSClustering(data=data.copy(), **kwargs)
    model.generate_categorical_rules()
    return model.predict()


for run in range(N_RUNS):
    # Fresh list per run: re-running this cell after an interruption cannot
    # carry partial rows from a previous execution into the CSV.
    run_records = []
    for dataset in datasets:
        n_clusters = dataset['n_clusters']
        labels = dataset['labels'].values.ravel()
        # Standardize the features, keeping the original column names
        data = pd.DataFrame(
            StandardScaler().fit_transform(dataset['data']),
            columns=dataset['data'].columns,
        )

        # (display name, zero-argument callable returning predicted labels).
        # The callables are invoked within this same iteration, so closing
        # over the loop variables is safe.
        algorithms = [
            # Seeded with the run index: runs differ from each other but the
            # KMeans results are reproducible across notebook executions.
            ('KMeans',
             lambda: KMeans(n_clusters=n_clusters, random_state=run).fit_predict(data)),
            ('DBSCAN',
             lambda: DBSCAN(eps=DBSCAN_EPS).fit_predict(data)),
            ('Agglomerative',
             lambda: AgglomerativeClustering(n_clusters=n_clusters).fit_predict(data)),
            ('CDSDG Clustering',
             lambda: _fit_cdsgd(data)),
            ('CDSDG Voting',
             lambda: _fit_cdsgd(data, most_voted=True)),
            ('CDSDG Clustering with n_clusters',
             lambda: _fit_cdsgd(data, cluster=n_clusters)),
        ]

        dataset_scores = []
        for algorithm_name, fit_predict in algorithms:
            silhouette, dunn, rand, pearson = _score_clustering(
                data, labels, fit_predict()
            )
            run_records.append({
                'Dataset': dataset['name'],
                'Algorithm': algorithm_name,
                'Silhouette': silhouette,
                'Dunn': dunn,
                'Rand': rand,
                'Pearson': pearson[0],
            })
            dataset_scores.append((algorithm_name, silhouette, dunn, rand, pearson))

        # Progress report once all algorithms have finished on this dataset
        print("Dataset: ", dataset['name'])
        print("-----------------------------")
        for algorithm_name, *metrics in dataset_scores:
            print(algorithm_name + ": ", *metrics)

    # Save the results of this run
    results = pd.DataFrame(run_records)
    results.to_csv(f'results{run}.csv', index=False)
    

In [8]:
# Evaluate the results: for every saved run, print the score table of each dataset
for i in range(20):
    results = pd.read_csv(f'results{i}.csv')
    # sort=False keeps the datasets in the order they appear in the file
    for dataset, dataset_results in results.groupby('Dataset', sort=False):
        print("Dataset: ", dataset)
        print("-----------------------------")
        print(dataset_results)
        print("-----------------------------")

Dataset:  Atom
-----------------------------
  Dataset                         Algorithm  Silhouette      Dunn      Rand  \
0    Atom                            KMeans    0.389511  0.044807  0.147562   
1    Atom                            DBSCAN    0.460870  0.018507  0.567669   
2    Atom                     Agglomerative    0.383773  0.068420  0.067072   
3    Atom                  CDSDG Clustering    0.453309  0.036231  0.487792   
4    Atom                      CDSDG Voting    0.453309  0.036231  0.487792   
5    Atom  CDSDG Clustering with n_clusters    0.388044  0.027397  0.171543   

    Pearson  
0  0.488252  
1  0.915116  
2 -0.386556  
3 -0.297969  
4 -0.297969  
5  0.511693  
-----------------------------
Dataset:  Chainlink
-----------------------------
      Dataset                         Algorithm  Silhouette      Dunn  \
6   Chainlink                            KMeans    0.283379  0.016326   
7   Chainlink                            DBSCAN    0.153636  0.220721   
8   