In [None]:
from cdsgd import DSClustering
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, adjusted_rand_score
from scipy.spatial.distance import cdist
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
#importamos los datasets clasico de sklearn
from sklearn.datasets import load_iris, load_digits, load_wine, load_breast_cancer, load_diabetes

In [None]:
def dunn_index(X, labels):
    """Compute the Dunn index of a clustering using Euclidean distances.

    The index is the smallest distance between two points assigned to
    different clusters divided by the largest cluster diameter (the greatest
    distance between two points of the same cluster). Larger values mean
    compact, well-separated clusters.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Numeric observations (NumPy array, pandas DataFrame or nested list).
    labels : array-like of shape (n_samples,)
        Cluster label of every observation. Every distinct value is treated
        as a cluster of its own; no label is given special meaning, so a
        noise marker such as -1 is scored like any other cluster.

    Returns
    -------
    numpy.float64
        The Dunn index. ``inf`` when every cluster has zero diameter but the
        clusters are separated, ``nan`` when both the separation and the
        largest diameter are zero.

    Raises
    ------
    ValueError
        If ``labels`` contains fewer than two distinct clusters, because the
        inter-cluster distance is undefined in that case.
    """
    points = np.asarray(X, dtype=float)
    # Flatten so that a column-shaped label container still yields a 1-D mask
    labels = np.asarray(labels).ravel()

    unique_labels = np.unique(labels)
    if unique_labels.size < 2:
        raise ValueError(
            "dunn_index requires at least two clusters, got %d" % unique_labels.size
        )

    # Slice each cluster once instead of re-slicing inside the pair loop
    clusters = [points[labels == label] for label in unique_labels]

    # Diameter of a cluster = largest pairwise distance; a singleton has diameter 0
    max_diameter = max(
        cdist(members, members, metric='euclidean').max() if len(members) > 1 else 0.0
        for members in clusters
    )

    # Smallest point-to-point distance over every unordered pair of clusters
    min_intercluster_distance = min(
        cdist(clusters[i], clusters[j], metric='euclidean').min()
        for i in range(len(clusters))
        for j in range(i + 1, len(clusters))
    )

    # Handle the degenerate denominator explicitly rather than relying on a
    # floating-point division by zero (and the RuntimeWarning it emits)
    if max_diameter == 0:
        if min_intercluster_distance > 0:
            return np.float64(np.inf)
        return np.float64(np.nan)

    return np.float64(min_intercluster_distance / max_diameter)

Experimento

In [None]:
# Benchmark datasets stored on disk: each one is a (*_Data.csv, *_Labels.csv) pair under data/
atom, atom_labels = map(pd.read_csv, ('data/Atom_Data.csv', 'data/Atom_Labels.csv'))
chainlink, chainlink_labels = map(pd.read_csv, ('data/Chainlink_Data.csv', 'data/Chainlink_Labels.csv'))
engytime, engytime_labels = map(pd.read_csv, ('data/EngyTime_Data.csv', 'data/EngyTime_Labels.csv'))
hepta, hepta_labels = map(pd.read_csv, ('data/Hepta_Data.csv', 'data/Hepta_Labels.csv'))
tetra, tetra_labels = map(pd.read_csv, ('data/Tetra_Data.csv', 'data/Tetra_Labels.csv'))
target, target_labels = map(pd.read_csv, ('data/Target_Data.csv', 'data/Target_Labels.csv'))
two_diamonds, two_diamonds_labels = map(pd.read_csv, ('data/TwoDiamonds_Data.csv', 'data/TwoDiamonds_Labels.csv'))
wing_nut, wing_nut_labels = map(pd.read_csv, ('data/WingNut_Data.csv', 'data/WingNut_Labels.csv'))


def _as_frames(bunch):
    """Split a scikit-learn dataset bunch into (features, labels) DataFrames."""
    features = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    truth = pd.DataFrame(bunch.target, columns=['target'])
    return features, truth


# Classic scikit-learn datasets, converted to the same DataFrame layout
iris = load_iris()
iris_data, iris_labels = _as_frames(iris)
digits = load_digits()
digits_data, digits_labels = _as_frames(digits)
wine = load_wine()
wine_data, wine_labels = _as_frames(wine)
breast_cancer = load_breast_cancer()
breast_cancer_data, breast_cancer_labels = _as_frames(breast_cancer)


def _describe(name, data, labels, n_clusters):
    """Bundle one dataset with its label frame and the number of clusters to request."""
    return {'name': name, 'data': data, 'labels': labels, 'n_clusters': n_clusters}


# For the CSV datasets the cluster count is the number of distinct values in
# the first column of the label frame.
_csv_sets = (
    ('Atom', atom, atom_labels),
    ('Chainlink', chainlink, chainlink_labels),
    ('EngyTime', engytime, engytime_labels),
    ('Hepta', hepta, hepta_labels),
    ('Tetra', tetra, tetra_labels),
    ('Target', target, target_labels),
    ('TwoDiamonds', two_diamonds, two_diamonds_labels),
    ('WingNut', wing_nut, wing_nut_labels),
)

# For the scikit-learn datasets the cluster count is written out by hand.
_sklearn_sets = (
    ('Iris', iris_data, iris_labels, 3),
    ('Digits', digits_data, digits_labels, 10),
    ('Wine', wine_data, wine_labels, 3),
    ('BreastCancer', breast_cancer_data, breast_cancer_labels, 2),
)

# Registry iterated by the experiment loop, in this exact order
datasets = [
    _describe(name, frame, truth, truth.nunique().values[0])
    for name, frame, truth in _csv_sets
] + [
    _describe(name, frame, truth, n_classes)
    for name, frame, truth, n_classes in _sklearn_sets
]

# Results: accumulator for the per-run score records
results_df = []

In [None]:
# Run every clustering algorithm on every dataset and record its internal
# validity scores (silhouette coefficient and Dunn index).

RANDOM_STATE = 42  # fixed seed so the KMeans rows are reproducible across runs
DBSCAN_EPS = 0.5   # same neighbourhood radius for every dataset, whatever its scale


def _cluster_scores(data, labels):
    """Return (silhouette, dunn) for a labelling, or (0, 0) when it cannot be scored.

    silhouette_score only accepts between 2 and n_samples - 1 distinct labels,
    so a labelling with a single cluster, or with every point in its own
    cluster, is reported as 0 for both metrics instead of aborting the run.
    """
    n_labels = len(np.unique(labels))
    if 1 < n_labels < np.size(labels):
        return silhouette_score(data, labels), dunn_index(data, labels)
    return 0, 0


def _result_row(dataset_name, algorithm, silhouette, dunn):
    """Build one record of the results table."""
    return {
        'Dataset': dataset_name,
        'Algorithm': algorithm,
        'Silhouette': silhouette,
        'Dunn': dunn
    }


# Start from an empty list so that re-running this cell never duplicates rows
# (and still works after results_df has been rebound to a DataFrame elsewhere).
results_df = []

# NOTE(review): the Algorithm labels spell the method 'CDSDG' while the package
# is named cdsgd; the strings are kept as-is so previously saved results stay
# comparable.
for dataset in datasets:
    n_clusters = dataset['n_clusters']
    data = dataset['data']

    # KMeans, asked for the expected number of clusters
    kmeans = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE)
    kmeans_labels = kmeans.fit_predict(data)
    kmeans_silhouette, kmeans_dunn = _cluster_scores(data, kmeans_labels)
    results_df.append(_result_row(dataset['name'], 'KMeans', kmeans_silhouette, kmeans_dunn))

    # DBSCAN
    # NOTE(review): DBSCAN labels noise points -1 and that label is scored here
    # as if it were a regular cluster — confirm this is intended.
    dbscan = DBSCAN(eps=DBSCAN_EPS)
    dbscan_labels = dbscan.fit_predict(data)
    dbscan_silhouette, dbscan_dunn = _cluster_scores(data, dbscan_labels)
    results_df.append(_result_row(dataset['name'], 'DBSCAN', dbscan_silhouette, dbscan_dunn))

    # Agglomerative clustering, asked for the expected number of clusters
    agglomerative = AgglomerativeClustering(n_clusters=n_clusters)
    agglomerative_labels = agglomerative.fit_predict(data)
    agglomerative_silhouette, agglomerative_dunn = _cluster_scores(data, agglomerative_labels)
    results_df.append(_result_row(
        dataset['name'], 'Agglomerative', agglomerative_silhouette, agglomerative_dunn
    ))

    # DSClustering with its default settings
    cdsgd = DSClustering(data=data)
    cdsgd.generate_categorical_rules()
    cdsgd_labels = cdsgd.predict()
    cdsgd_silhouette, cdsgd_dunn = _cluster_scores(data, cdsgd_labels)
    results_df.append(_result_row(
        dataset['name'], 'CDSDG Clustering', cdsgd_silhouette, cdsgd_dunn
    ))

    # DSClustering with most_voted=True
    cdsgd1 = DSClustering(data=data, most_voted=True)
    cdsgd1.generate_categorical_rules()
    cdsgd1_labels = cdsgd1.predict()
    cdsgd1_silhouette, cdsgd1_dunn = _cluster_scores(data, cdsgd1_labels)
    results_df.append(_result_row(
        dataset['name'], 'CDSDG Voting', cdsgd1_silhouette, cdsgd1_dunn
    ))

    # DSClustering given the expected number of clusters
    cdsgd2 = DSClustering(data=data, cluster=n_clusters)
    cdsgd2.generate_categorical_rules()
    cdsgd2_labels = cdsgd2.predict()
    cdsgd2_silhouette, cdsgd2_dunn = _cluster_scores(data, cdsgd2_labels)
    results_df.append(_result_row(
        dataset['name'], 'CDSDG Clustering with n_clusters', cdsgd2_silhouette, cdsgd2_dunn
    ))

    # Progress marker: printed once all six algorithms have finished on this dataset
    print("Dataset: ", dataset['name'])
    print("-----------------------------")

In [None]:
# Turn the accumulated score records into a table with one row per record
results_df = pd.DataFrame(data=results_df)
# Persist the table (without the index column) so it can be reloaded from results.csv
results_df.to_csv(path_or_buf='results.csv', index=False)


In [16]:
# Reload the saved scores so this cell does not depend on re-running the experiment
results_df = pd.read_csv('results.csv')
# Print one ranking table per dataset, best silhouette first
for dataset in results_df['Dataset'].unique():
    dataset_results = results_df[results_df['Dataset'] == dataset]
    # Rank on the full-precision scores and round only for display: sorting
    # values already rounded to 2 decimals would order near-ties by row
    # position instead of by score. mergesort is a stable sort.
    dataset_results = dataset_results.sort_values(
        by='Silhouette', ascending=False, kind='mergesort'
    ).round(2)
    print(dataset)
    print(dataset_results)
    print("------------------------------------------------")
    print("------------------------------------------------")

    

Atom
  Dataset                         Algorithm  Silhouette  Dunn
3    Atom                  CDSDG Clustering        0.48  0.02
4    Atom                      CDSDG Voting        0.48  0.02
0    Atom                            KMeans        0.42  0.04
5    Atom  CDSDG Clustering with n_clusters        0.42  0.03
2    Atom                     Agglomerative        0.41  0.09
1    Atom                            DBSCAN        0.00  0.00
------------------------------------------------
------------------------------------------------
Chainlink
      Dataset                         Algorithm  Silhouette  Dunn
6   Chainlink                            KMeans        0.36  0.01
11  Chainlink  CDSDG Clustering with n_clusters        0.36  0.01
9   Chainlink                  CDSDG Clustering        0.35  0.02
10  Chainlink                      CDSDG Voting        0.35  0.02
8   Chainlink                     Agglomerative        0.34  0.04
7   Chainlink                            DBSCAN        0.