In [16]:
import os
# Cap the OpenMP thread pool at 4 threads for this kernel session.
# NOTE(review): presumably this has to run before numpy / scikit-learn are
# imported so their native runtimes see the value — confirm.
os.environ["OMP_NUM_THREADS"] = "4"

In [17]:
# Load the autoreload extension and set mode 2.
%load_ext autoreload
%autoreload 2
# Work from the `code` directory: the dataset and output paths used further
# down ('../datasets_processed', '../output') are relative to it.
# NOTE(review): the path is relative, so re-running this cell while already
# inside `code` fails with "file not found" — see the captured output below.
%cd code

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[WinError 2] El sistema no puede encontrar el archivo especificado: 'code'
C:\Users\maja9\Documents\GitHub\Clustering-Algorithms\code


In [19]:
from sklearn.metrics import adjusted_rand_score, silhouette_score
from sklearn.metrics import davies_bouldin_score

def adjusted_rand_index(labels_true, labels_pred):
    """Return the Adjusted Rand Index between two labelings.

    Thin wrapper around ``sklearn.metrics.adjusted_rand_score`` so that all
    metrics in this notebook share one naming scheme.

    Args:
        labels_true: Reference (ground-truth) label of every sample.
        labels_pred: Cluster label assigned to every sample.

    Returns:
        Whatever ``adjusted_rand_score`` returns for the two labelings.
    """
    ari = adjusted_rand_score(labels_true, labels_pred)
    return ari

def purity_score(y_true, y_pred):
    """Return the purity of a clustering against reference labels.

    Purity is the fraction of samples that belong to the most frequent
    reference class of the cluster they were assigned to.

    Args:
        y_true: Reference (ground-truth) label of every sample.
        y_pred: Cluster label assigned to every sample.

    Returns:
        Sum of the per-cluster majority counts divided by the total number
        of samples counted in the contingency table.
    """
    # Rows are reference classes, columns are predicted clusters.
    counts = pd.crosstab(y_true, y_pred).values
    # Size of the dominant reference class inside each predicted cluster.
    majority_per_cluster = counts.max(axis=0)
    return majority_per_cluster.sum() / counts.sum()

def davies_bouldin_index(data, labels):
    """Return the Davies-Bouldin index of a clustering.

    Thin wrapper around ``sklearn.metrics.davies_bouldin_score``.

    Args:
        data: Feature matrix the clustering was computed on.
        labels: Cluster label assigned to every sample of ``data``.

    Returns:
        Whatever ``davies_bouldin_score`` returns for ``data`` and ``labels``.
    """
    dbi = davies_bouldin_score(data, labels)
    return dbi

def silhouette_coefficient(data, labels):
    """Return the silhouette coefficient of a clustering.

    Thin wrapper around ``sklearn.metrics.silhouette_score`` called with its
    default settings.

    Args:
        data: Feature matrix the clustering was computed on.
        labels: Cluster label assigned to every sample of ``data``.

    Returns:
        Whatever ``silhouette_score`` returns for ``data`` and ``labels``.
    """
    silhouette = silhouette_score(data, labels)
    return silhouette

def f_measure(labels_true, labels_pred):
    """Return a contingency-table based F-measure of a clustering.

    "Precision" is the sum of the column-wise maxima of the contingency table
    (best-matching reference class per predicted cluster) divided by the
    number of predicted labels; "recall" is the sum of the row-wise maxima
    (best-matching cluster per reference class) divided by the number of
    reference labels. The result is their harmonic mean.

    Args:
        labels_true: Reference (ground-truth) label of every sample.
        labels_pred: Cluster label assigned to every sample.

    Returns:
        ``2 * precision * recall / (precision + recall)``.
    """
    # Rows are reference classes, columns are predicted clusters.
    table = pd.crosstab(labels_true, labels_pred)
    best_per_cluster = table.max(axis=0).sum()
    best_per_class = table.max(axis=1).sum()

    precision = best_per_cluster / len(labels_pred)
    recall = best_per_class / len(labels_true)

    numerator = 2 * (precision * recall)
    return numerator / (precision + recall)

def get_metrics(X, y, labels_pred, k, affinity, assign_labels, n_neighbors, eigen_solver):
    """Score one clustering run and bundle the scores with its configuration.

    Args:
        X: Feature matrix the clustering was computed on.
        y: Reference (ground-truth) label of every sample.
        labels_pred: Cluster label assigned to every sample.
        k: Number of clusters requested for the run.
        affinity: Affinity setting used for the run.
        assign_labels: Label-assignment strategy used for the run.
        n_neighbors: Neighbour count used for the run (stored as given).
        eigen_solver: Eigen solver used for the run.

    Returns:
        dict: The five configuration values followed by "ARI", "Purity",
        "F-Measure", "Davies-Bouldin Index" and "Silhouette Coefficient".
    """
    # Internal validation: needs only the data and the predicted labels.
    internal_scores = {
        "Davies-Bouldin Index": davies_bouldin_index(X, labels_pred),
        "Silhouette Coefficient": silhouette_coefficient(X, labels_pred),
    }

    # External validation: compares the prediction with the reference labels.
    external_scores = {
        "ARI": adjusted_rand_index(y, labels_pred),
        "Purity": purity_score(y, labels_pred),
        "F-Measure": f_measure(y, labels_pred),
    }

    run_config = {
        "k": k,
        "affinity": affinity,
        "assign_labels": assign_labels,
        "n_neighbors": n_neighbors,
        "eigen_solver": eigen_solver,
    }

    # Merge order fixes the key (and later the column) order of the record.
    return {**run_config, **external_scores, **internal_scores}

In [21]:
import pandas as pd
import numpy as np
from sklearn.cluster import SpectralClustering

def run_all_spectral_clustering(data_X, data_y, n_neighbors_grid=(200,), random_state=None):
    """Run SpectralClustering over a hyper-parameter grid and score every run.

    The grid covers ``k`` in 2..7, affinity in {"rbf", "nearest_neighbors"},
    assign_labels in {"kmeans", "discretize"} and eigen_solver in
    {"arpack", "lobpcg"}. For the "nearest_neighbors" affinity every value of
    ``n_neighbors_grid`` is tried as well.

    Args:
        data_X: Feature matrix to cluster.
        data_y: Reference labels, used only for the external metrics.
        n_neighbors_grid: Iterable of ``n_neighbors`` values to try with the
            "nearest_neighbors" affinity. The default ``(200,)`` is the value
            that used to be hard-coded (annotated "vowel"); the grids
            previously kept as commented-out code were ``[5, 10, 15, 20]``
            ("grid") and ``[30, 40, 50, 100]`` ("sick").
        random_state: Forwarded to ``SpectralClustering``. ``None`` (default)
            keeps the previous unseeded behaviour; pass an int to make a
            sweep repeatable.

    Returns:
        pandas.DataFrame: One row per run, as produced by ``get_metrics``.
        Runs with the "rbf" affinity store the string ``'nan'`` in the
        ``n_neighbors`` column.
    """
    # Materialise once: the grid is re-iterated for every nearest-neighbours
    # combination, so a one-shot iterator would be empty after the first pass.
    n_neighbors_grid = tuple(n_neighbors_grid)

    results = []

    for k in range(2, 8):
        for affinity in ["rbf", "nearest_neighbors"]:
            uses_neighbors = affinity == "nearest_neighbors"
            # 'nan' is the placeholder recorded when n_neighbors does not apply.
            neighbor_options = n_neighbors_grid if uses_neighbors else ('nan',)

            for assign_labels in ['kmeans', 'discretize']:
                for eigen_solver in ['arpack', 'lobpcg']:
                    for n_neighbors in neighbor_options:
                        # Only hand n_neighbors to the estimator when the
                        # affinity uses it; otherwise its default is kept.
                        extra_params = {"n_neighbors": n_neighbors} if uses_neighbors else {}

                        spectral = SpectralClustering(
                            n_clusters=k,
                            affinity=affinity,
                            assign_labels=assign_labels,
                            eigen_solver=eigen_solver,
                            random_state=random_state,
                            **extra_params,
                        )

                        labels = spectral.fit_predict(data_X)

                        results_spectral = get_metrics(
                            data_X, data_y, labels, k, affinity,
                            assign_labels, n_neighbors, eigen_solver,
                        )
                        results.append(results_spectral)

    results_df = pd.DataFrame(results)
    return results_df

In [24]:
# "grid" dataset: every column except the last is a feature, the last column
# holds the reference label.
df = pd.read_csv('../datasets_processed/grid.csv')
df_X, df_y = np.array(df[df.columns[:-1]]), np.array(df[df.columns[-1]])

# Sweep the spectral-clustering grid and persist one row per configuration.
resultados = run_all_spectral_clustering(data_X=df_X, data_y=df_y)
resultados.to_csv('../output/results_spectral_grid.csv')

In [25]:
# "sick" dataset: every column except the last is a feature, the last column
# holds the reference label.
df = pd.read_csv('../datasets_processed/sick.csv')
df_X, df_y = np.array(df[df.columns[:-1]]), np.array(df[df.columns[-1]])

# Sweep the spectral-clustering grid and persist one row per configuration.
resultados = run_all_spectral_clustering(data_X=df_X, data_y=df_y)
resultados.to_csv('../output/results_spectral_sick.csv')

In [26]:
# "vowel" dataset: every column except the last is a feature, the last column
# holds the reference label.
df = pd.read_csv('../datasets_processed/vowel.csv')
df_X, df_y = np.array(df[df.columns[:-1]]), np.array(df[df.columns[-1]])

# Sweep the spectral-clustering grid and persist one row per configuration.
resultados = run_all_spectral_clustering(data_X=df_X, data_y=df_y)
resultados.to_csv('../output/results_spectral_vowel.csv')

