# HDBSCAN clustering result analysis

Let's start by loading up some libraries and static data that may be useful in the next steps.

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads all modules before each
# cell execution, so edits to the imported project packages are picked up.
%load_ext autoreload
%autoreload 2

In [None]:
from visualization import vis_data, vis_cluster
from collections import defaultdict, Counter
from utilities import constants, evaluation
from preprocessing import pp_action
from clustering import clu_hdbscan
import plotly.graph_objs as go
import plotly.offline as ply
import pandas as pd
import numpy as np
import json
import os

In [None]:
# Load the run configuration and the static JSON resources (family labels and
# words). Context managers make sure every file handle is closed right after
# parsing instead of being left open for the lifetime of the kernel.
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

with open(os.path.join(constants.dir_d, constants.json_labels), 'r') as labels_file:
    uuids_family = json.load(labels_file)

with open(os.path.join(constants.dir_d, constants.json_words), 'r') as words_file:
    words = json.load(words_file)

# Set up plotly's offline mode so figures render inline in the notebook.
ply.init_notebook_mode(connected=True)

In [None]:
# Candidate values for the two HDBSCAN parameters swept by test_params below:
# they are forwarded to clu_hdbscan.cluster as 'min_sample' and
# 'min_cluster_size' respectively.
min_sample_params = [50, 100, 200, 300]
min_cluster_sizes = [50, 100, 200, 300]

## Data selection

Select a subset of the original dataset. Then the selected subset will be split into a training and a testing set.


In [None]:
# Run the project's pre-processing step driven by `config`. The result is used
# below through its index, its 'selected' column and its `fam_num` attribute.
samples_data = pp_action.pre_process(config)
# NOTE(review): presumably performs and displays the train/test split described
# above - confirm against pp_action.split_show_data.
pp_action.split_show_data(samples_data)

In [None]:
# Keep only the rows flagged as selected, then read the identifiers (index)
# and the numerical family labels from that same subset.
selected_samples = samples_data[samples_data['selected'] == 1]
uuids = selected_samples.index.tolist()
numerical_labels = selected_samples.fam_num.tolist()

## Utilities

Let's define a few helper functions.


In [None]:
def plot_surface(to_plot, a_x, a_y, title):
    """Render one 3D surface inline in the notebook.

    Args:
        to_plot: z values of the surface.
        a_x: x coordinates of the surface.
        a_y: y coordinates of the surface.
        title: figure title; also passed to ``iplot`` as ``filename``.
    """
    surface = go.Surface(x=a_x, y=a_y, z=to_plot)
    figure = go.Figure(data=[surface], layout=go.Layout(title=title))
    ply.iplot(figure, filename=title)

In [None]:
def plot_surfaces(to_plot_f, to_plot_s, to_plot_q, clusters, min_sample_params, min_cluster_sizes):
    """Plot the four result grids of a parameter sweep as 3D surfaces.

    Every grid is expected to be indexed as
    ``grid[min_sample index][min_cluster_size index]``, which is how
    ``test_params`` fills them.

    Args:
        to_plot_f: grid plotted under the title 'f1 scores'.
        to_plot_s: grid plotted under the title 'silhouette scores'.
        to_plot_q: grid plotted under the title 'quality'.
        clusters: grid plotted under the title 'clusters'.
        min_sample_params: values matching the rows of each grid.
        min_cluster_sizes: values matching the columns of each grid.
    """
    titled_grids = (
        (to_plot_f, 'f1 scores'),
        (to_plot_s, 'silhouette scores'),
        (to_plot_q, 'quality'),
        (clusters, 'clusters'),
    )

    for grid, title in titled_grids:
        # Plotly maps the columns of z to x and the rows of z to y, so the
        # column values (min_cluster_sizes) go on x and the row values
        # (min_sample_params) on y. Passing them the other way round
        # transposes the axes whenever the two lists differ.
        plot_surface(grid, min_cluster_sizes, min_sample_params, title)

In [None]:
def test_params(data_matrix, numerical_labels, config, min_sample_params, min_cluster_sizes):
    """Run HDBSCAN over a grid of parameters and collect one value per cell.

    For every (min_sample, min_cluster_size) pair the data is clustered with
    ``clu_hdbscan.cluster`` and scored with ``evaluation.evaluate_clustering``.

    Args:
        data_matrix: feature matrix handed to the clustering routine.
        numerical_labels: reference labels used for clustering and evaluation.
        config: project configuration forwarded to the clustering routine.
        min_sample_params: values tried for 'min_sample' (grid rows).
        min_cluster_sizes: values tried for 'min_cluster_size' (grid columns).

    Returns:
        A tuple ``(to_plot_f, to_plot_s, to_plot_q, clusters)`` of arrays
        shaped ``(len(min_sample_params), len(min_cluster_sizes))`` holding
        ``results[7]``, ``results[11]`` (0 when no evaluation data is
        available), ``results[10]`` and the number of clusters found.
        NOTE(review): presumably the F1, silhouette and quality scores, going
        by the plot titles used for these grids - confirm against
        evaluation.evaluate_clustering.
    """
    grid_shape = (len(min_sample_params), len(min_cluster_sizes))
    to_plot_f = np.zeros(grid_shape)
    to_plot_s = np.zeros(grid_shape)
    to_plot_q = np.zeros(grid_shape)
    clusters = np.zeros(grid_shape)

    for row, min_sample in enumerate(min_sample_params):
        for col, min_cluster_size in enumerate(min_cluster_sizes):
            hdbscan_params = {
                'distance': 'c',
                'min_cluster_size': min_cluster_size,
                'min_sample': min_sample,
            }
            clustering_labels, _model, _modifier, data, metric = clu_hdbscan.cluster(
                data_matrix,
                numerical_labels,
                config,
                hdbscan_params,
            )

            # The label -1 is not counted as a cluster (presumably HDBSCAN's
            # noise label).
            distinct_labels = set(clustering_labels)
            distinct_labels.discard(-1)
            n_clusters = len(distinct_labels)

            # With fewer than two clusters the data is withheld from the
            # evaluation and the silhouette cell is recorded as 0 below.
            if n_clusters < 2:
                data = None

            results = evaluation.evaluate_clustering(
                numerical_labels, clustering_labels, data, metric, True
            )

            to_plot_f[row, col] = results[7]
            to_plot_s[row, col] = 0 if data is None else results[11]
            to_plot_q[row, col] = results[10]
            clusters[row, col] = n_clusters

    return to_plot_f, to_plot_s, to_plot_q, clusters

## Clustering

Here we will use HDBSCAN, a hierarchical density-based clustering algorithm.

### 256 features
Let's start with the data matrix obtained by using PCA for feature extraction.

In [None]:
# Load the 256-feature PCA data matrix from its plain-text dump.
data_matrix_p = np.loadtxt('data/matrix/pca_256_28582.txt')

In [None]:
# Sweep the min_sample / min_cluster_size grid on the 256-feature PCA matrix.
to_plot_f, to_plot_s, to_plot_q, clusters = test_params(data_matrix_p, numerical_labels, config, min_sample_params, min_cluster_sizes)

In [None]:
# Plot the four result grids of the 256-feature PCA sweep as surfaces.
plot_surfaces(to_plot_f, to_plot_s, to_plot_q, clusters, min_sample_params, min_cluster_sizes)

Now let's see how it compares with the same algorithm applied to a data matrix obtained using Random Forest Classifiers for feature selection.

In [None]:
# Load the 256-feature matrix obtained through Random Forest feature selection.
data_matrix_r = np.loadtxt('data/matrix/rfc_256_28582.txt')

In [None]:
# Sweep the same parameter grid on the 256-feature RFC matrix.
to_plot_f, to_plot_s, to_plot_q, clusters = test_params(data_matrix_r, numerical_labels, config, min_sample_params, min_cluster_sizes)

In [None]:
# Plot the four result grids of the 256-feature RFC sweep as surfaces.
plot_surfaces(to_plot_f, to_plot_s, to_plot_q, clusters, min_sample_params, min_cluster_sizes)

Finally, let's use the least relevant features, as identified by the Random Forest Classifier (RFC).

In [None]:
# Load the 256-feature matrix built from the least relevant RFC features.
data_matrix_ir = np.loadtxt('data/matrix/irfc_256_28582.txt')

In [None]:
# Sweep the same parameter grid on the least-relevant-features matrix.
to_plot_f, to_plot_s, to_plot_q, clusters = test_params(data_matrix_ir, numerical_labels, config, min_sample_params, min_cluster_sizes)

In [None]:
# Plot the four result grids of the least-relevant-features sweep as surfaces.
plot_surfaces(to_plot_f, to_plot_s, to_plot_q, clusters, min_sample_params, min_cluster_sizes)

### 512 features

In [None]:
# Load the 512-feature PCA data matrix from its plain-text dump.
data_matrix_p = np.loadtxt('data/matrix/pca_512_28582.txt')

In [None]:
# Sweep the min_sample / min_cluster_size grid on the 512-feature PCA matrix.
to_plot_f, to_plot_s, to_plot_q, clusters = test_params(data_matrix_p, numerical_labels, config, min_sample_params, min_cluster_sizes)

In [None]:
# Plot the four result grids of the 512-feature PCA sweep as surfaces.
plot_surfaces(to_plot_f, to_plot_s, to_plot_q, clusters, min_sample_params, min_cluster_sizes)

Now let's see how it compares with the same algorithm applied to a data matrix obtained using Random Forest Classifiers for feature selection.

In [None]:
# Load the 512-feature matrix obtained through Random Forest feature selection.
data_matrix_r = np.loadtxt('data/matrix/rfc_512_28582.txt')

In [None]:
# Sweep the same parameter grid on the 512-feature RFC matrix.
to_plot_f, to_plot_s, to_plot_q, clusters = test_params(data_matrix_r, numerical_labels, config, min_sample_params, min_cluster_sizes)

In [None]:
# Plot the four result grids of the 512-feature RFC sweep as surfaces.
plot_surfaces(to_plot_f, to_plot_s, to_plot_q, clusters, min_sample_params, min_cluster_sizes)

### 1024 features


In [None]:
# Load the 1024-feature PCA data matrix from its plain-text dump.
data_matrix_p = np.loadtxt('data/matrix/pca_1024_28582.txt')

In [None]:
# Sweep the min_sample / min_cluster_size grid on the 1024-feature PCA matrix.
to_plot_f, to_plot_s, to_plot_q, clusters = test_params(data_matrix_p, numerical_labels, config, min_sample_params, min_cluster_sizes)

In [None]:
# Plot the four result grids of the 1024-feature PCA sweep as surfaces.
plot_surfaces(to_plot_f, to_plot_s, to_plot_q, clusters, min_sample_params, min_cluster_sizes)

Now let's see how it compares with the same algorithm applied to a data matrix obtained using Random Forest Classifiers for feature selection.

In [None]:
# Load the 1024-feature matrix obtained through Random Forest feature selection.
data_matrix_r = np.loadtxt('data/matrix/rfc_1024_28582.txt')

In [None]:
# Sweep the same parameter grid on the 1024-feature RFC matrix.
to_plot_f, to_plot_s, to_plot_q, clusters = test_params(data_matrix_r, numerical_labels, config, min_sample_params, min_cluster_sizes)

In [None]:
# Plot the four result grids of the 1024-feature RFC sweep as surfaces.
plot_surfaces(to_plot_f, to_plot_s, to_plot_q, clusters, min_sample_params, min_cluster_sizes)

### 2048 features

In [None]:
data_matrix_r = np.loadtxt('data/matrix/pca_2048_28582.txt')

In [None]:
to_plot_f, to_plot_s, to_plot_q, clusters = test_params(data_matrix_r, numerical_labels, config, min_sample_params, min_cluster_sizes)

In [None]:
# Plot the four result grids of the 2048-feature PCA sweep as surfaces.
plot_surfaces(to_plot_f, to_plot_s, to_plot_q, clusters, min_sample_params, min_cluster_sizes)

Now let's see how it compares with the same algorithm applied to a data matrix obtained using Random Forest Classifiers for feature selection.


In [None]:
# Load the 2048-feature matrix obtained through Random Forest feature selection.
data_matrix_r = np.loadtxt('data/matrix/rfc_2048_28582.txt')

In [None]:
# Sweep the same parameter grid on the 2048-feature RFC matrix.
to_plot_f, to_plot_s, to_plot_q, clusters = test_params(data_matrix_r, numerical_labels, config, min_sample_params, min_cluster_sizes)

In [None]:
# Plot the four result grids of the 2048-feature RFC sweep as surfaces.
plot_surfaces(to_plot_f, to_plot_s, to_plot_q, clusters, min_sample_params, min_cluster_sizes)