In [5]:
from dsgd import DSClassifierMultiQ
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, adjusted_rand_score
from scipy.spatial.distance import cdist
from scipy.stats import pearsonr
from sklearn.datasets import load_iris,load_wine
from sklearn.preprocessing import StandardScaler

In [6]:
def dunn_index(X, labels):
    unique_labels = np.unique(labels)
    k = len(unique_labels)
    
    # Calcular el diámetro de cada clúster
    diameters = []
    for label in unique_labels:
        cluster_points = X[labels == label]
        if len(cluster_points) > 1:
            diameters.append(np.max(cdist(cluster_points, cluster_points, metric='euclidean')))
        else:
            diameters.append(0)
    
    max_diameter = np.max(diameters)
    
    # Calcular la distancia mínima entre clusters
    min_distances = []
    for i in range(k):
        for j in range(i + 1, k):
            cluster_i_points = X[labels == unique_labels[i]]
            cluster_j_points = X[labels == unique_labels[j]]
            min_distance = np.min(cdist(cluster_i_points, cluster_j_points, metric='euclidean'))
            min_distances.append(min_distance)
    
    min_intercluster_distance = np.min(min_distances)
    
    # Índice de Dunn
    dunn_index_value = min_intercluster_distance / max_diameter
    
    return dunn_index_value

Experimento

In [7]:
atom = pd.read_csv('data/Atom_Data.csv')
atom_labels = pd.read_csv('data/Atom_Labels.csv')
chainlink = pd.read_csv('data/Chainlink_Data.csv')
chainlink_labels = pd.read_csv('data/Chainlink_Labels.csv')
engytime = pd.read_csv('data/EngyTime_Data.csv')
engytime_labels = pd.read_csv('data/EngyTime_Labels.csv')
hepta = pd.read_csv('data/Hepta_Data.csv')
hepta_labels = pd.read_csv('data/Hepta_Labels.csv')
tetra = pd.read_csv('data/Tetra_Data.csv')
tetra_labels = pd.read_csv('data/Tetra_Labels.csv')
target = pd.read_csv('data/Target_Data.csv')
target_labels = pd.read_csv('data/Target_Labels.csv')
two_diamonds = pd.read_csv('data/TwoDiamonds_Data.csv')
two_diamonds_labels = pd.read_csv('data/TwoDiamonds_Labels.csv')
wing_nut = pd.read_csv('data/WingNut_Data.csv')
wing_nut_labels = pd.read_csv('data/WingNut_Labels.csv')
# Cargamos los datasets clasico de sklearn
iris = load_iris()
iris_data = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_labels = pd.DataFrame(iris.target, columns=['target'])
wine = load_wine()
wine_data = pd.DataFrame(wine.data, columns=wine.feature_names)
wine_labels = pd.DataFrame(wine.target, columns=['target'])
# Cargamos los datasets de prueba de la tesis
uniform = pd.read_csv('data/uniform_df.csv')
uniform_data = uniform.drop(columns=['labels'])
uniform_labels = pd.DataFrame(uniform['labels'])
rectangle = pd.read_csv('data/rectangle_df.csv')
rectangle_data = rectangle.drop(columns=['labels'])
rectangle_labels = pd.DataFrame(rectangle['labels'])
gaussian = pd.read_csv('data/gaussian_df.csv')
gaussian_data = gaussian.drop(columns=['labels'])
gaussian_labels = pd.DataFrame(gaussian['labels'])
gaussian_mix = pd.read_csv('data/gaussian_mix_df.csv')
gaussian_mix_data = gaussian_mix.drop(columns=['labels'])
gaussian_mix_labels = pd.DataFrame(gaussian_mix['labels'])
datasets = [
    {
        'name': 'Atom',
        'data': atom,
        'labels': atom_labels,
        'n_clusters': atom_labels.nunique().values[0]
    },{
        'name': 'Chainlink',
        'data': chainlink,
        'labels': chainlink_labels,
        'n_clusters': chainlink_labels.nunique().values[0]
    },{
        'name': 'EngyTime',
        'data': engytime,
        'labels': engytime_labels,
        'n_clusters': engytime_labels.nunique().values[0]
    },{
        'name': 'Hepta',
        'data': hepta,
        'labels': hepta_labels,
        'n_clusters': hepta_labels.nunique().values[0]
    },{
        'name': 'Tetra',
        'data': tetra,
        'labels': tetra_labels,
        'n_clusters': tetra_labels.nunique().values[0]
    },{
        'name': 'Target',
        'data': target,
        'labels': target_labels,
        'n_clusters': target_labels.nunique().values[0]
    },{
        'name': 'TwoDiamonds',
        'data': two_diamonds,
        'labels': two_diamonds_labels,
        'n_clusters': two_diamonds_labels.nunique().values[0]
    },{
        'name': 'WingNut',
        'data': wing_nut,
        'labels': wing_nut_labels,
        'n_clusters': wing_nut_labels.nunique().values[0]
    },{
        'name': 'Iris',
        'data': iris_data,
        'labels': iris_labels,
        'n_clusters': 3
    },{
        'name': 'Wine',
        'data': wine_data,
        'labels': wine_labels,
        'n_clusters': 3
    },{
        'name': 'Uniform',
        'data': uniform_data,
        'labels': uniform_labels,
        'n_clusters': uniform_labels.nunique().values[0]
    },{
        'name': 'Rectangle',
        'data': rectangle_data,
        'labels': rectangle_labels,
        'n_clusters': rectangle_labels.nunique().values[0]
    },{
        'name': 'Gaussian',
        'data': gaussian_data,
        'labels': gaussian_labels,
        'n_clusters': gaussian_labels.nunique().values[0]
    },{
        'name': 'GaussianMix',
        'data': gaussian_mix_data,
        'labels': gaussian_mix_labels,
        'n_clusters': gaussian_mix_labels.nunique().values[0]
    }
]
#Resultados
results_df = []

In [8]:
import json
# Asegura que el directorio exista antes de guardar el archivo
import os
os.makedirs("results_centroid", exist_ok=True)

def normalize_clusters(labels1, labels2):
    # Create a mapping from labels1 to labels2
    unique_labels1 = np.unique(labels1)
    unique_labels2 = np.unique(labels2)
    
    mapping = {}
    for label1 in unique_labels1:
        # Find the most common label2 for each label1
        mask = (labels1 == label1)
        most_common_label2 = pd.Series(labels2[mask]).mode()[0]
        mapping[label1] = most_common_label2
    
    # Apply the mapping to labels1
    normalized_labels1 = np.array([mapping[label] for label in labels1])
    
    return normalized_labels1  

def compare_clusters(labels1, labels2):
    # Create a contingency table
    contingency_table = pd.crosstab(labels1, labels2)
    print("Contingency Table:")
    print(contingency_table)

    # Calculate the Pearson correlation coefficient
    corr, _ = pearsonr(labels1, labels2)
    print(f"Pearson correlation coefficient: {corr}")

#Iteramos sobre los datasets
# testear al menos 10 casoss, evaluar diferencias con Ricardo
# buscar definiciones de interpretaabilidad y como lo miden
for i in range(20):
    for dataset in datasets:
        print(f"Processing dataset: {dataset['name']} ({i+1}/20)")
        
        n_clusters = dataset['n_clusters']
        data = dataset['data']
        labels = dataset['labels'].values.ravel()
        #KMeans
        kmeans = KMeans(n_clusters=n_clusters)
        kmeans_labels = kmeans.fit_predict(data)
        kmeans_silhouette = silhouette_score(data, kmeans_labels)
        kmeans_dunn = dunn_index(data, kmeans_labels)
        kmeans_rand = adjusted_rand_score(labels, kmeans_labels)
        kmeans_pearson = pearsonr(labels, kmeans_labels)
        results_df = results_df + [{
            'Dataset': dataset['name'],
            'Algorithm': 'KMeans',
            'Silhouette': kmeans_silhouette,
            'Dunn': kmeans_dunn,
            'Rand': kmeans_rand,
            'Pearson': kmeans_pearson[0]
        }]

        #DBSCAN
        dbscan = DBSCAN(eps=0.5)
        dbscan_labels = dbscan.fit_predict(data)
        if len(np.unique(dbscan_labels)) > 1:
            dbscan_silhouette = silhouette_score(data, dbscan_labels)
            dbscan_dunn = dunn_index(data, dbscan_labels)
            dbscan_rand = adjusted_rand_score(labels, dbscan_labels)
            dbscan_pearson = pearsonr(labels, dbscan_labels)
        else:
            dbscan_silhouette = 0
            dbscan_dunn = 0
            dbscan_rand = 0
            dbscan_pearson = [0, 0]
        results_df = results_df + [{
            'Dataset': dataset['name'],
            'Algorithm': 'DBSCAN',
            'Silhouette': dbscan_silhouette,
            'Dunn': dbscan_dunn,
            'Rand': dbscan_rand,
            'Pearson': dbscan_pearson[0]
        }]

        #Agglomerative
        agglomerative = AgglomerativeClustering(n_clusters=n_clusters)
        agglomerative_labels = agglomerative.fit_predict(data)
        agglomerative_silhouette = silhouette_score(data, agglomerative_labels)
        agglomerative_dunn = dunn_index(data, agglomerative_labels)
        agglomerative_rand = adjusted_rand_score(labels, agglomerative_labels)
        agglomerative_pearson = pearsonr(labels, agglomerative_labels)
        results_df = results_df + [{
            'Dataset': dataset['name'],
            'Algorithm': 'Agglomerative',
            'Silhouette': agglomerative_silhouette,
            'Dunn': agglomerative_dunn,
            'Rand': agglomerative_rand,
            'Pearson': agglomerative_pearson[0]
        }]
        
        #DSC de Kmeans
        X_values = data.values
        DSC = DSClassifierMultiQ(n_clusters, min_iter=20, max_iter=200, debug_mode=True, lossfn="MSE", num_workers=0, min_dloss=1e-7, precompute_rules=True)
        losses, epoch, dt = DSC.fit(X_values, kmeans_labels, add_single_rules=True, single_rules_breaks=3, add_mult_rules=False, column_names=data.columns, print_every_epochs=1,)
        #calculamos el silhouette score con las predicciones de DSC
        predictions = DSC.predict(X_values)
        dsc_silhouette = silhouette_score(X_values, predictions)
        dsc_dunn = dunn_index(X_values, predictions)
        dsc_ari = adjusted_rand_score(labels, predictions)
        dsc_pearson = pearsonr(labels, predictions)
        results_df = results_df + [{
            'Dataset': dataset['name'],
            'Algorithm': 'DSC',
            'Silhouette': dsc_silhouette,
            'Dunn': dsc_dunn,
            'Rand': dsc_ari,
            'Pearson': dsc_pearson[0]
        }]
        
        """ #Combinamos los labels de KMeans, Agglomerative y DBSCAN, este ultimo solo en caso de que tenga el mismo numero de clusters, para esto usamos la normalizacion de etiquetas
        #print("Comparing clusters...")
        
        #compare_clusters(kmeans_labels, agglomerative_labels)
        normalized_kmeans_labels = normalize_clusters(kmeans_labels, agglomerative_labels)
        #compare_clusters(normalized_kmeans_labels, agglomerative_labels)
        combined_labels = np.concatenate((normalized_kmeans_labels, agglomerative_labels), axis=0)
        X_combined = np.concatenate((X_values, X_values), axis=0)
        # Entrenamos DSC con los labels combinados
        DSC_combined = DSClassifierMultiQ(n_clusters, min_iter=20, max_iter=200, debug_mode=True, lossfn="MSE", num_workers=0, min_dloss=1e-7, precompute_rules=True)
        losses, epoch, dt = DSC_combined.fit(X_combined, combined_labels, add_single_rules=True, single_rules_breaks=3, add_mult_rules=False, column_names=data.columns, print_every_epochs=1,)
        #calculamos el silhouette score con las predicciones de DSC
        predictions_combined = DSC_combined.predict(X_combined)
        dsc_combined_silhouette = silhouette_score(X_combined, predictions_combined)
        dsc_combined_dunn = dunn_index(X_combined, predictions_combined)
        dsc_combined_ari = adjusted_rand_score(combined_labels, predictions_combined)
        dsc_combined_pearson = pearsonr(combined_labels, predictions_combined)
        results_df = results_df + [{
            'Dataset': dataset['name'],
            'Algorithm': 'DSC_Combined',
            'Silhouette': dsc_combined_silhouette,
            'Dunn': dsc_combined_dunn,
            'Rand': dsc_combined_ari,
            'Pearson': dsc_combined_pearson[0]
        }] """
        
        # Print results for the current dataset
        print("Dataset: ", dataset['name'])
        print("-----------------------------")
        print("KMeans: ", kmeans_silhouette, kmeans_dunn, kmeans_rand, kmeans_pearson[0])
        print("DBSCAN: ", dbscan_silhouette, dbscan_dunn, dbscan_rand, dbscan_pearson[0])
        print("Agglomerative: ", agglomerative_silhouette, agglomerative_dunn, agglomerative_rand, agglomerative_pearson[0])
        print("DSC: ", dsc_silhouette, dsc_dunn, dsc_ari, dsc_pearson[0])
        #print("DSC Combined: ", dsc_combined_silhouette, dsc_combined_dunn, dsc_combined_ari, dsc_combined_pearson[0])
        # Guarda los centroides de KMeans en un diccionario
        kmeans_centroids = []
        for idx, centroid in enumerate(kmeans.cluster_centers_):
            centroid_dict = {feature_name: float(value) for feature_name, value in zip(data.columns, centroid)}
            pred, cls, df_rls, builder = DSC.predict_explain(centroid)
            kmeans_centroids.append({"centroid": idx, "values": centroid_dict, "prediction": pred, "class": cls, "rules": df_rls.to_dict(orient='records')})
            
        # Obtiene las reglas más importantes de DSC
        rules = DSC.find_most_important_rules(threshold=0.01)

        
        # Guarda la información en un diccionario y luego en un archivo JSON
        info_dict = {
            "dataset": dataset['name'],
            "kmeans_centroids": kmeans_centroids,
            "dsc_rules": rules
        }
        with open(f"results_centroid/info_{dataset['name']}_{i}.json", "w") as f:
            json.dump(info_dict, f, indent=4, default=str)
        print("-----------------------------")
    #save results
    results = pd.DataFrame(results_df)
    results.to_csv('results_centroid/results'+str(i)+'.csv', index=False)
    results_df = []
    

Processing dataset: Atom (1/20)
Optimization started
Processing epoch	200	0.0337	
Training time: 27.45s, epochs: 200

Least training loss reached: 0.033
Dataset:  Atom
-----------------------------
KMeans:  0.4174755342972209 0.03231027991438114 0.19511197741600156 -0.5330189014262895
DBSCAN:  0 0 0 0
Agglomerative:  0.41073709133177916 0.08623451728951541 0.09862621818643041 -0.4323697725073315
DSC:  0.4136409972773324 0.040079469474647786 0.15932661957011873 -0.49999999999999994
-----------------------------
Processing dataset: Chainlink (1/20)
Optimization started
Processing epoch	200	0.0247	
Training time: 34.03s, epochs: 200

Least training loss reached: 0.024
Dataset:  Chainlink
-----------------------------
KMeans:  0.3573143551526814 0.018168506721606987 0.0939570501002004 -0.30800000000000005
DBSCAN:  0.16665067817267834 0.37189737642279397 1.0 1.0
Agglomerative:  0.3399813559048249 0.03920914359049763 0.35706442581215536 -0.6530953403124256
DSC:  0.3572088830888027 0.01816850

In [9]:
#evaluamos los resultados
for i in range(20):
    results = pd.read_csv('results/results'+str(i)+'.csv')
    for dataset in results['Dataset'].unique():
        dataset_results = results[results['Dataset'] == dataset]
        print("Dataset: ", dataset)
        print("-----------------------------")
        print(dataset_results)
        print("-----------------------------")

Dataset:  Atom
-----------------------------
  Dataset                         Algorithm  Silhouette      Dunn      Rand  \
0    Atom                            KMeans    0.389760  0.051867  0.149491   
1    Atom                            DBSCAN    0.460870  0.018507  0.567669   
2    Atom                     Agglomerative    0.383773  0.068420  0.067072   
3    Atom                  CDSDG Clustering    0.453309  0.036231  0.487792   
4    Atom                      CDSDG Voting    0.453309  0.036231  0.487792   
5    Atom  CDSDG Clustering with n_clusters    0.388044  0.027397  0.171543   

    Pearson  
0 -0.490214  
1  0.915116  
2 -0.386556  
3 -0.297969  
4 -0.297969  
5  0.511693  
-----------------------------
Dataset:  Chainlink
-----------------------------
      Dataset                         Algorithm  Silhouette      Dunn  \
6   Chainlink                            KMeans    0.282240  0.017234   
7   Chainlink                            DBSCAN    0.153636  0.220721   
8   