In [8]:
import os
# Set OMP_NUM_THREADS to "4" for this kernel process.
# NOTE(review): presumably this must run before the numerical libraries are
# imported for the setting to take effect -- keep this cell first; confirm.
os.environ["OMP_NUM_THREADS"] = "4"

In [9]:
# Enable IPython's autoreload extension in mode 2.
%load_ext autoreload
%autoreload 2
# Switch the working directory to `code`; the relative '../...' paths used by the
# later cells are resolved from there.
# NOTE(review): the path is relative, so re-running this cell once the kernel is
# already inside `code` fails to find the directory (see the recorded cell output).
%cd code

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[WinError 2] El sistema no puede encontrar el archivo especificado: 'code'
C:\Users\maja9\Documents\GitHub\Clustering-Algorithms\code


In [10]:
from sklearn.metrics import adjusted_rand_score, silhouette_score
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score

def adjusted_rand_index(labels_true, labels_pred):
    """Return scikit-learn's ``adjusted_rand_score`` for the two labelings.

    Args:
        labels_true: Reference labels, forwarded as the first argument.
        labels_pred: Predicted cluster labels, forwarded as the second argument.

    Returns:
        Whatever ``adjusted_rand_score`` returns for these inputs.
    """
    ari = adjusted_rand_score(labels_true, labels_pred)
    return ari

def purity_score(y_true, y_pred):
    """Compute clustering purity from the true/predicted contingency table.

    Args:
        y_true: Reference labels (rows of the contingency table).
        y_pred: Predicted cluster labels (columns of the contingency table).

    Returns:
        The sum of the largest count in each predicted-cluster column divided
        by the total number of counted samples.
    """
    # Rows are true classes, columns are predicted clusters.
    counts = pd.crosstab(y_true, y_pred).values
    # For every predicted cluster keep only its most frequent true class.
    dominant_per_cluster = np.amax(counts, axis=0)
    matched = np.sum(dominant_per_cluster)
    total = np.sum(counts)
    return matched / total

def davies_bouldin_index(data, labels):
    """Return scikit-learn's ``davies_bouldin_score`` for a clustering.

    Args:
        data: Feature data, forwarded as the first argument.
        labels: Cluster labels for ``data``, forwarded as the second argument.

    Returns:
        Whatever ``davies_bouldin_score`` returns for these inputs.
    """
    dbi = davies_bouldin_score(data, labels)
    return dbi

def silhouette_coefficient(data, labels):
    """Return scikit-learn's ``silhouette_score`` for a clustering.

    Args:
        data: Feature data, forwarded as the first argument.
        labels: Cluster labels for ``data``, forwarded as the second argument.

    Returns:
        Whatever ``silhouette_score`` returns with its default settings.
    """
    score = silhouette_score(data, labels)
    return score

def f_measure(labels_true, labels_pred):
    """Harmonic mean of a contingency-table based precision and recall.

    Args:
        labels_true: Reference labels (rows of the contingency table).
        labels_pred: Predicted cluster labels (columns of the table).

    Returns:
        ``2 * P * R / (P + R)`` where ``P`` sums the largest count of every
        predicted-cluster column over ``len(labels_pred)`` and ``R`` sums the
        largest count of every true-class row over ``len(labels_true)``.
    """
    table = pd.crosstab(labels_true, labels_pred)

    # Best-matching true class for each predicted cluster (column-wise maxima).
    column_hits = table.max(axis=0).sum()
    # Best-matching predicted cluster for each true class (row-wise maxima).
    row_hits = table.max(axis=1).sum()

    precision = column_hits / len(labels_pred)
    recall = row_hits / len(labels_true)

    numerator = 2 * (precision * recall)
    return numerator / (precision + recall)

def calinski_harabasz_index(data, labels):
    """Return scikit-learn's ``calinski_harabasz_score`` for a clustering.

    Args:
        data: Feature data, forwarded as the first argument.
        labels: Cluster labels for ``data``, forwarded as the second argument.

    Returns:
        Whatever ``calinski_harabasz_score`` returns for these inputs.
    """
    score = calinski_harabasz_score(data, labels)
    return score


def get_metrics_general(X, labels_true, labels_pred, method, time, n_iterations = None):
    """Evaluate one clustering result and return its metrics as a flat dict.

    Args:
        X: Feature data the clustering was computed on; passed to the
            internal metrics (Davies-Bouldin, silhouette, Calinski-Harabasz).
        labels_true: Reference labels; passed to the external metrics
            (ARI, purity, F-measure).
        labels_pred: Cluster labels produced by the method under evaluation.
        method: Identifier stored verbatim under the "Method" key.
        time: Solving time stored verbatim under the "Solving Time" key.
            The parameter name shadows the ``time`` module inside this
            function only.
        n_iterations: Iteration count of the method, or ``None`` when the
            method does not report one.

    Returns:
        dict: One record with the keys "Method", "ARI", "Purity",
        "F-Measure", "Davies-Bouldin Index", "Silhouette Coefficient",
        "Calinski", "Solving Time" and "Iterations". "Iterations" is NaN
        when ``n_iterations`` is ``None``.
    """
    # Internal metrics: computed from the data and the predicted labels only.
    dbi = davies_bouldin_index(X, labels_pred)
    silhouette = silhouette_coefficient(X, labels_pred)
    calinski = calinski_harabasz_index(X, labels_pred)

    # External metrics: compare the predicted labels with the reference labels.
    ari = adjusted_rand_index(labels_true, labels_pred)
    purity = purity_score(labels_true, labels_pred)
    fmeasure = f_measure(labels_true, labels_pred)

    results = {
        "Method": method,
        "ARI": ari,
        "Purity": purity,
        "F-Measure": fmeasure,
        "Davies-Bouldin Index": dbi,
        "Silhouette Coefficient": silhouette,
        "Calinski": calinski,
        "Solving Time": time,
        # `np.nan` (lowercase) exists in every NumPy release; the `np.NaN`
        # alias was removed in NumPy 2.0. Testing against None (instead of
        # truthiness) keeps a legitimate iteration count of 0.
        "Iterations": np.nan if n_iterations is None else n_iterations,
    }
    return results


In [11]:
import time
import pandas as pd
import numpy as np
from sklearn.cluster import SpectralClustering

def _spectral_method_name(k, affinity, assign_labels, eigen_solver, n_neighbors=None):
    """Build a unique, readable "Method" label for one spectral configuration."""
    fields = [
        f"spectral_k{k}",
        f"affinity-{affinity.replace('_', '-')}",
        f"assign-labels-{assign_labels.replace('_', '-')}",
        f"eigen-solver-{eigen_solver.replace('_', '-')}",
    ]
    # Only the nearest-neighbours affinity has this hyper-parameter; including it
    # keeps the rows of the n_neighbors sweep distinguishable from each other.
    if n_neighbors is not None:
        fields.append(f"n-neighbors-{n_neighbors}")
    return "_".join(fields)


def _run_spectral_config(data_X, data_y, k, affinity, assign_labels, eigen_solver,
                         n_neighbors=None, random_state=None):
    """Fit one SpectralClustering configuration, time it and return its metrics record."""
    # Pass n_neighbors only when it was requested so the estimator's own default
    # is used for every other affinity.
    extra_params = {} if n_neighbors is None else {"n_neighbors": n_neighbors}

    # perf_counter is monotonic, so the measured interval cannot be distorted by
    # system clock adjustments.
    start_time = time.perf_counter()
    spectral = SpectralClustering(
        n_clusters=k,
        affinity=affinity,
        assign_labels=assign_labels,
        eigen_solver=eigen_solver,
        random_state=random_state,
        **extra_params,
    )
    labels = spectral.fit_predict(data_X)
    elapsed_time = time.perf_counter() - start_time

    method = _spectral_method_name(k, affinity, assign_labels, eigen_solver, n_neighbors)
    return get_metrics_general(data_X, data_y, labels, method, elapsed_time, n_iterations=None)


def run_all_spectral_clustering(data_X, data_y, random_state=None):
    """Run a grid of SpectralClustering configurations and tabulate their metrics.

    The sweep covers ``n_clusters`` from 2 to 13, the "rbf" and
    "nearest_neighbors" affinities, the "kmeans" and "discretize" label
    assignment strategies and the "arpack" and "lobpcg" eigen solvers. For the
    "nearest_neighbors" affinity every combination is additionally run with
    ``n_neighbors`` in 25, 50, 100 and 200.

    Args:
        data_X: Feature data to cluster.
        data_y: Reference labels used for the external metrics.
        random_state: Optional seed forwarded to every ``SpectralClustering``
            instance. The default ``None`` keeps the estimator's unseeded
            behaviour.

    Returns:
        pandas.DataFrame: One row per configuration, in sweep order, with the
        columns produced by ``get_metrics_general``.
    """
    results = []

    for k in range(2, 14):
        for affinity in ["rbf", "nearest_neighbors"]:
            # A single `None` entry means "do not pass n_neighbors at all".
            if affinity == "nearest_neighbors":
                neighbor_grid = [25, 50, 100, 200]
            else:
                neighbor_grid = [None]

            for assign_labels in ['kmeans', 'discretize']:
                for eigen_solver in ['arpack', 'lobpcg']:
                    for n_neighbors in neighbor_grid:
                        results.append(
                            _run_spectral_config(
                                data_X,
                                data_y,
                                k,
                                affinity,
                                assign_labels,
                                eigen_solver,
                                n_neighbors=n_neighbors,
                                random_state=random_state,
                            )
                        )

    results_df = pd.DataFrame(results)
    return results_df

In [12]:
# Grid dataset: every column except the last is a feature; the last column is
# passed on as the reference labels.
df = pd.read_csv('../datasets_processed/grid.csv')
df_X = np.array(df.iloc[:, :-1])
df_y = np.array(df.iloc[:, -1])

# Run the whole spectral-clustering sweep and store the metrics table.
resultados = run_all_spectral_clustering(df_X, df_y)

resultados.to_csv('../output/results_spectral_grid.csv')

In [13]:
# Sick dataset: every column except the last is a feature; the last column is
# passed on as the reference labels.
df = pd.read_csv('../datasets_processed/sick.csv')
df_X = np.array(df.iloc[:, :-1])
df_y = np.array(df.iloc[:, -1])

# Run the whole spectral-clustering sweep and store the metrics table.
resultados = run_all_spectral_clustering(df_X, df_y)

resultados.to_csv('../output/results_spectral_sick.csv')

In [14]:
# Vowel dataset: every column except the last is a feature; the last column is
# passed on as the reference labels.
df = pd.read_csv('../datasets_processed/vowel.csv')
df_X = np.array(df.iloc[:, :-1])
df_y = np.array(df.iloc[:, -1])

# Run the whole spectral-clustering sweep and store the metrics table.
resultados = run_all_spectral_clustering(df_X, df_y)

resultados.to_csv('../output/results_spectral_vowel.csv')

[1.47463760e-15 1.45512889e-06 7.83352109e-07 8.30549700e-07
 1.03652188e-06 1.48727017e-06 3.31872421e-06 2.54245606e-06
 9.58694074e-06 1.00064103e-05]
not reaching the requested tolerance 1e-05.
Use iteration 19 instead with accuracy 
3.1047354018669226e-06.

  _, diffusion_map = lobpcg(
[4.50481312e-12 1.45512889e-06 7.83352109e-07 8.30549700e-07
 1.03652188e-06 1.48727017e-06 3.31872421e-06 2.54245606e-06
 9.58694074e-06 1.00064103e-05]
not reaching the requested tolerance 1e-05.
  _, diffusion_map = lobpcg(
