In [1]:
import os

# Set OMP_NUM_THREADS to "4" for this process.
# NOTE(review): presumably meant to limit OpenMP threading in the numeric
# libraries imported in later cells - confirm it has to run before them.
os.environ.update(OMP_NUM_THREADS="4")

In [2]:
# Enable the autoreload extension in mode 2.
%load_ext autoreload
%autoreload 2
# Change the working directory to ./code; the relative '../...' paths used by
# later cells are resolved from there.
# NOTE(review): this cell is not idempotent - running it a second time looks
# for a nested 'code' directory. Restart the kernel before re-running.
%cd code

C:\Users\maja9\Documents\GitHub\Clustering-Algorithms\code


In [3]:
import pandas as pd
import numpy as np

from sklearn.neighbors import kneighbors_graph
from sklearn.cluster import SpectralClustering


import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from sklearn.metrics import adjusted_rand_score, silhouette_score
from sklearn.metrics import davies_bouldin_score

def adjusted_rand_index(labels_true, labels_pred):
    """Return the Adjusted Rand Index between two labelings.

    Thin wrapper around ``sklearn.metrics.adjusted_rand_score``.

    Args:
        labels_true: Reference (ground-truth) labels.
        labels_pred: Cluster labels to evaluate.

    Returns:
        The score produced by ``adjusted_rand_score``.
    """
    ari = adjusted_rand_score(labels_true=labels_true, labels_pred=labels_pred)
    return ari

def purity_score(y_true, y_pred):
    """Return the purity of a clustering with respect to reference labels.

    A contingency table is built with true labels as rows and predicted
    clusters as columns. Purity is the sum of the largest count in each
    column divided by the total number of counted samples.

    Args:
        y_true: Reference labels (array-like accepted by ``pd.crosstab``).
        y_pred: Predicted cluster labels, aligned with ``y_true``.

    Returns:
        Purity as a float.
    """
    counts = pd.crosstab(y_true, y_pred).values
    # For each predicted cluster, keep only its most frequent true label.
    dominant_per_cluster = np.amax(counts, axis=0)
    total = np.sum(counts)
    return np.sum(dominant_per_cluster) / total

def davies_bouldin_index(data, labels):
    """Return the Davies-Bouldin index of a clustering.

    Thin wrapper around ``sklearn.metrics.davies_bouldin_score``.

    Args:
        data: Feature data the clustering was computed on.
        labels: Cluster label assigned to each sample in ``data``.

    Returns:
        The score produced by ``davies_bouldin_score``.
    """
    dbi = davies_bouldin_score(data, labels)
    return dbi

def silhouette_coefficient(data, labels):
    """Return the silhouette score of a clustering.

    Thin wrapper around ``sklearn.metrics.silhouette_score`` using its
    default settings.

    Args:
        data: Feature data the clustering was computed on.
        labels: Cluster label assigned to each sample in ``data``.

    Returns:
        The score produced by ``silhouette_score``.
    """
    score = silhouette_score(data, labels)
    return score

def f_measure(labels_true, labels_pred):
    """Return a contingency-table based F-measure for a clustering.

    With true labels as rows and predicted clusters as columns:

    * precision = (sum of the per-column maxima) / len(labels_pred)
    * recall    = (sum of the per-row maxima)    / len(labels_true)

    The result is the harmonic mean of these two quantities.

    Args:
        labels_true: Reference labels (array-like accepted by ``pd.crosstab``).
        labels_pred: Predicted cluster labels, aligned with ``labels_true``.

    Returns:
        The F-measure as a float.
    """
    table = pd.crosstab(labels_true, labels_pred)

    best_match_per_cluster = table.max(axis=0).sum()
    best_match_per_class = table.max(axis=1).sum()

    precision = best_match_per_cluster / len(labels_pred)
    recall = best_match_per_class / len(labels_true)

    numerator = 2 * (precision * recall)
    return numerator / (precision + recall)

def get_metrics(X, y, labels_pred, k, affinity, assign_labels, n_neighbors, eigen_solver):
    """Score one clustering result and bundle it with its configuration.

    Args:
        X: Feature data passed to the Davies-Bouldin and silhouette metrics.
        y: Reference labels passed to ARI, purity and F-measure.
        labels_pred: Cluster labels produced for ``X``.
        k: Number of clusters used for this run (stored as-is).
        affinity: Affinity setting used for this run (stored as-is).
        assign_labels: Label-assignment strategy used (stored as-is).
        n_neighbors: Neighbour count used, or a placeholder (stored as-is).
        eigen_solver: Eigen solver used for this run (stored as-is).

    Returns:
        dict with the configuration keys followed by "ARI", "Purity",
        "F-Measure", "Davies-Bouldin Index" and "Silhouette Coefficient".
    """
    # Metrics that only need the data and the predicted labels.
    internal_scores = {
        "Davies-Bouldin Index": davies_bouldin_index(X, labels_pred),
        "Silhouette Coefficient": silhouette_coefficient(X, labels_pred),
    }

    # Metrics that compare the predicted labels against the reference labels.
    external_scores = {
        "ARI": adjusted_rand_index(y, labels_pred),
        "Purity": purity_score(y, labels_pred),
        "F-Measure": f_measure(y, labels_pred),
    }

    configuration = {
        "k": k,
        "affinity": affinity,
        "assign_labels": assign_labels,
        "n_neighbors": n_neighbors,
        "eigen_solver": eigen_solver,
    }

    # Merge order fixes the key (and later the DataFrame column) order.
    return {**configuration, **external_scores, **internal_scores}

In [5]:
import pandas as pd
import numpy as np
from sklearn.cluster import SpectralClustering
import matplotlib.pyplot as plt

def _evaluate_spectral_config(data_X, data_y, k, affinity, assign_labels, eigen_solver, n_neighbors=None):
    """Fit one SpectralClustering configuration and return its metrics row, or None if fitting fails."""
    params = dict(
        n_clusters=k,
        affinity=affinity,
        assign_labels=assign_labels,
        eigen_solver=eigen_solver,
        random_state=42,
    )
    if n_neighbors is not None:
        params["n_neighbors"] = n_neighbors
    spectral = SpectralClustering(**params)

    try:
        labels = spectral.fit_predict(data_X)
    except Exception as e:
        # Best effort: report the failing combination and let the grid continue.
        neighbors_note = "" if n_neighbors is None else f", n_neighbors={n_neighbors}"
        print(f"Error en combinación k={k}, affinity={affinity}{neighbors_note}, assign_labels={assign_labels}, eigen_solver={eigen_solver}: {e}")
        return None

    # Runs without a neighbour count are recorded with the string 'nan'.
    recorded_neighbors = 'nan' if n_neighbors is None else n_neighbors
    # Errors raised while computing metrics are not caught here and propagate.
    return get_metrics(data_X, data_y, labels, k, affinity, assign_labels, recorded_neighbors, eigen_solver)


def run_all_spectral_clustering(data_X, data_y, n_neighbors_grid=(5, 10, 15, 20), k_values=range(2, 8)):
    """Run a grid of SpectralClustering configurations and collect their metrics.

    The grid is the product of ``k_values`` x affinity ("rbf",
    "nearest_neighbors") x assign_labels ("kmeans", "discretize") x
    eigen_solver ("arpack", "lobpcg"). For the "nearest_neighbors" affinity
    every value in ``n_neighbors_grid`` is tried as well; for "rbf" a single
    run is made and ``n_neighbors`` is recorded as the string 'nan'.

    Configurations whose ``fit_predict`` raises are reported with ``print``
    and skipped.

    Args:
        data_X: Feature data passed to ``SpectralClustering.fit_predict`` and
            to the metrics.
        data_y: Reference labels used by the label-based metrics.
        n_neighbors_grid: Neighbour counts to try for the "nearest_neighbors"
            affinity. The default matches the values previously hard-coded
            for the grid dataset; the commented-out alternatives in the
            original cell were [30, 40, 50, 100] (marked "sick") and [200]
            (marked "vowel").
        k_values: Cluster counts to try. Defaults to 2..7.

    Returns:
        pandas.DataFrame with one row per successfully fitted configuration,
        in grid order.
    """
    from itertools import product

    # Materialise so a one-shot iterable can be reused for every combination.
    n_neighbors_grid = tuple(n_neighbors_grid)

    results = []
    grid = product(
        k_values,
        ["rbf", "nearest_neighbors"],
        ["kmeans", "discretize"],
        ["arpack", "lobpcg"],
    )
    for k, affinity, assign_labels, eigen_solver in grid:
        # Only the kNN affinity takes a neighbour count; other affinities run once.
        neighbor_options = n_neighbors_grid if affinity == "nearest_neighbors" else (None,)

        for n_neighbors in neighbor_options:
            row = _evaluate_spectral_config(
                data_X, data_y, k, affinity, assign_labels, eigen_solver, n_neighbors
            )
            if row is not None:
                results.append(row)

    return pd.DataFrame(results)

In [6]:
# Load the preprocessed "grid" dataset. The last column is used as the
# reference labels and every other column as a feature.
df = pd.read_csv('../datasets_processed/grid.csv')
df_X = df.iloc[:, :-1].to_numpy()
df_y = df.iloc[:, -1].to_numpy()

# Evaluate the whole spectral clustering grid and save the metrics table.
resultados = run_all_spectral_clustering(df_X, df_y)
resultados.to_csv('../output/results_spectral_grid.csv')

In [7]:
# Load the preprocessed "sick" dataset. The last column is used as the
# reference labels and every other column as a feature.
df = pd.read_csv('../datasets_processed/sick.csv')
df_X = df.iloc[:, :-1].to_numpy()
df_y = df.iloc[:, -1].to_numpy()

# Evaluate the whole spectral clustering grid and save the metrics table.
resultados = run_all_spectral_clustering(df_X, df_y)
resultados.to_csv('../output/results_spectral_sick.csv')

[4.71296956e-14 7.05227023e-06 8.30182003e-06 8.80697813e-06
 1.01368458e-05]
not reaching the requested tolerance 1e-05.
  _, diffusion_map = lobpcg(
[4.71296956e-14 7.05227023e-06 8.30182003e-06 8.80697813e-06
 1.01368458e-05]
not reaching the requested tolerance 1e-05.
  _, diffusion_map = lobpcg(
[3.32908889e-14 6.96368701e-06 4.94249234e-06 7.64645341e-06
 1.25402305e-05 8.73297520e-06]
not reaching the requested tolerance 1e-05.
Use iteration 361 instead with accuracy 
6.799554574429118e-06.

  _, diffusion_map = lobpcg(
[2.02221396e-14 6.94469049e-06 4.94427641e-06 7.64888243e-06
 1.10708017e-05 1.01886715e-05]
not reaching the requested tolerance 1e-05.
  _, diffusion_map = lobpcg(
[3.32908889e-14 6.96368701e-06 4.94249234e-06 7.64645341e-06
 1.25402305e-05 8.73297520e-06]
not reaching the requested tolerance 1e-05.
Use iteration 361 instead with accuracy 
6.799554574429118e-06.

  _, diffusion_map = lobpcg(
[2.02221396e-14 6.94469049e-06 4.94427641e-06 7.64888243e-06
 1.107080

In [10]:
# Load the preprocessed "vowel" dataset. The last column is used as the
# reference labels and every other column as a feature.
df = pd.read_csv('../datasets_processed/vowel.csv')
df_X = df.iloc[:, :-1].to_numpy()
df_y = df.iloc[:, -1].to_numpy()

# Evaluate the whole spectral clustering grid and save the metrics table.
resultados = run_all_spectral_clustering(df_X, df_y)
resultados.to_csv('../output/results_spectral_vowel.csv')

[8.03850917e-14 6.16667380e-06 1.08716316e-05 7.18521745e-06
 1.09312198e-05 9.60157122e-06]
not reaching the requested tolerance 1e-05.
Use iteration 63 instead with accuracy 
7.425390436753336e-06.

  _, diffusion_map = lobpcg(
[2.23856333e-13 6.17595494e-06 1.08790087e-05 6.98728436e-06
 8.82652179e-06 1.16835703e-05]
not reaching the requested tolerance 1e-05.
  _, diffusion_map = lobpcg(
[1.25079424e-12 4.29937458e-06 6.10177405e-06 8.90234151e-06
 1.03863005e-05 9.20606480e-06]
not reaching the requested tolerance 1e-05.
Use iteration 24 instead with accuracy 
6.482642776040287e-06.

  _, diffusion_map = lobpcg(
[1.54871537e-12 4.29937130e-06 6.10177187e-06 8.90234852e-06
 1.03862971e-05 9.20606481e-06]
not reaching the requested tolerance 1e-05.
  _, diffusion_map = lobpcg(
[8.03850917e-14 6.16667380e-06 1.08716316e-05 7.18521745e-06
 1.09312198e-05 9.60157122e-06]
not reaching the requested tolerance 1e-05.
Use iteration 63 instead with accuracy 
7.425390436753336e-06.

  _, di