# Birch clustering result analysis

Let's start by loading up some libraries and static data that may be useful in the next steps.

In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell executes so edits to the project modules are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
from visualization import vis_data, vis_cluster
from collections import defaultdict, Counter
from utilities import constants, evaluation
from preprocessing import pp_action
from clustering import clu_birch
import plotly.graph_objs as go
import plotly.offline as ply
import pandas as pd
import numpy as np
import json
import os

In [None]:
# Load the run configuration and the static lookup files. Each file is opened
# in a context manager so its handle is closed as soon as the JSON is parsed,
# instead of being left open for the lifetime of the kernel.
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

with open(os.path.join(constants.dir_d, constants.json_labels), 'r') as labels_file:
    uuids_family = json.load(labels_file)

with open(os.path.join(constants.dir_d, constants.json_words), 'r') as words_file:
    words = json.load(words_file)

# Switch plotly's offline mode to inline notebook rendering.
ply.init_notebook_mode(connected=True)

In [None]:
# Candidate values for the two Birch parameters swept below. Both parameters
# are explored over the same values, but each gets its own list object.
thresholds = [50, 100, 150, 300, 500, 1000, 2000, 3000]
branching_factors = list(thresholds)

## Data selection

Select a subset of the original dataset. Then the selected subset will be split into a training and a testing set.


In [None]:
# Build the samples table from the loaded configuration, then pass it to
# split_show_data.
# NOTE(review): per the notes above this is presumably where the train/test
# split is made and displayed - confirm in pp_action.
samples_data = pp_action.pre_process(config)
pp_action.split_show_data(samples_data)

In [None]:
# Keep only the rows flagged as selected: their index values become the sample
# uuids and their fam_num values the numerical labels, in matching order.
selected_mask = samples_data['selected'] == 1
uuids = samples_data.index[selected_mask].tolist()
numerical_labels = samples_data.fam_num[selected_mask].tolist()

## Utilities

Let's define a few helper functions.


In [None]:
def plot_surface(to_plot, a_x, a_y, title):
    """Render one interactive 3D surface inline in the notebook.

    to_plot is handed to plotly as the surface's z values, with a_x and a_y
    as its x and y coordinates. title is used both as the figure title and
    as the filename passed to iplot.
    """
    surface = go.Surface(x=a_x, y=a_y, z=to_plot)
    figure = go.Figure(data=[surface], layout=go.Layout(title=title))
    ply.iplot(figure, filename=title)

In [None]:
def plot_surfaces(to_plot_f, to_plot_s, clusters, thresholds, branching_factors):
    """Plot the F1, silhouette and cluster-count grids as three surfaces.

    Each grid is expected in the layout produced by test_params: rows follow
    thresholds and columns follow branching_factors. Plotly's Surface reads
    z[i][j] as the value at (x[j], y[i]), so every grid is transposed before
    plotting. That puts the threshold on the x axis and the branching factor
    on the y axis; without the transpose the surface is mirrored across the
    diagonal and each value appears at the swapped parameter pair.
    """
    titled_grids = (
        (to_plot_f, 'f1 scores'),
        (to_plot_s, 'silhouette scores'),
        (clusters, 'clusters'),
    )
    for grid, title in titled_grids:
        plot_surface(np.transpose(grid), thresholds, branching_factors, title)

In [None]:
def test_params(data_matrix, numerical_labels, config, thresholds, branching_factors):
    """Run Birch for every (threshold, branching factor) pair and collect scores.

    Returns three arrays of shape (len(thresholds), len(branching_factors)),
    indexed [threshold position][branching factor position]:
      * element 7 of the evaluation result (plotted elsewhere as the F1 score),
      * element 8 of the evaluation result (plotted elsewhere as the silhouette
        score), or 0 when no data is available for it,
      * the number of clusters found, not counting a -1 label if present.
    """
    grid_shape = (len(thresholds), len(branching_factors))
    f_scores = np.zeros(grid_shape)
    s_scores = np.zeros(grid_shape)
    cluster_counts = np.zeros(grid_shape)

    for row, threshold in enumerate(thresholds):
        for col, branching_factor in enumerate(branching_factors):
            birch_params = {
                'num_clusters': 7,
                'threshold': threshold,
                'branching_factor': branching_factor
            }
            clustering_labels, _model, _modifier, data, metric = clu_birch.cluster(
                data_matrix, numerical_labels, config, birch_params
            )

            # A -1 label, if present, is not counted as a cluster.
            n_clusters = len(set(clustering_labels))
            if -1 in clustering_labels:
                n_clusters -= 1

            # With fewer than two clusters the data is withheld from the
            # evaluation and the second score is recorded as 0 below.
            if n_clusters < 2:
                data = None

            results = evaluation.evaluate_clustering(
                numerical_labels, clustering_labels, data, metric, True
            )

            f_scores[row, col] = results[7]
            s_scores[row, col] = 0 if data is None else results[8]
            cluster_counts[row, col] = n_clusters

    return f_scores, s_scores, cluster_counts

## Clustering

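Here we will use Birch, a hierarchical clustering algorithm that incrementally summarizes the data in a clustering feature (CF) tree, whose shape is controlled by a threshold and a branching factor.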

### 128 features
Let's start with the data matrix obtained by using PCA for feature extraction.

In [None]:
# Load the 128-feature PCA data matrix from its text file.
data_matrix_p = np.loadtxt('data/matrix/pca_128_1209.txt')

In [None]:
# Sweep every threshold / branching-factor pair on the 128-feature PCA matrix.
to_plot_f, to_plot_s, clusters = test_params(data_matrix_p, numerical_labels, config, thresholds, branching_factors)

In [None]:
# Show the three result surfaces for the 128-feature PCA sweep.
plot_surfaces(to_plot_f, to_plot_s, clusters, thresholds, branching_factors)

Now let's see how it compares with the same algorithm applied to a data matrix obtained using Random Forest Classifiers for feature selection.

In [None]:
# Load the 128-feature matrix obtained through Random Forest feature selection.
data_matrix_r = np.loadtxt('data/matrix/rfc_128_1209.txt')

In [None]:
# Same sweep on the 128-feature Random Forest matrix; overwrites the previous result grids.
to_plot_f, to_plot_s, clusters = test_params(data_matrix_r, numerical_labels, config, thresholds, branching_factors)

In [None]:
# Show the three result surfaces for the 128-feature Random Forest sweep.
plot_surfaces(to_plot_f, to_plot_s, clusters, thresholds, branching_factors)

### 256 features


In [None]:
# Load the 256-feature PCA data matrix, replacing the 128-feature one.
data_matrix_p = np.loadtxt('data/matrix/pca_256_1209.txt')

In [None]:
# Sweep every threshold / branching-factor pair on the 256-feature PCA matrix.
to_plot_f, to_plot_s, clusters = test_params(data_matrix_p, numerical_labels, config, thresholds, branching_factors)

In [None]:
# Show the three result surfaces for the 256-feature PCA sweep.
plot_surfaces(to_plot_f, to_plot_s, clusters, thresholds, branching_factors)

Now let's see how it compares with the same algorithm applied to a data matrix obtained using Random Forest Classifiers for feature selection.

In [None]:
# Load the 256-feature matrix obtained through Random Forest feature selection.
# NOTE(review): no sweep or plot for this matrix follows before the next
# section - confirm whether cells are missing here.
data_matrix_r = np.loadtxt('data/matrix/rfc_256_1209.txt')

### 512 features

In [None]:
# Load the 512-feature PCA data matrix, replacing the 256-feature one.
data_matrix_p = np.loadtxt('data/matrix/pca_512_1209.txt')

In [None]:
# Sweep every threshold / branching-factor pair on the 512-feature PCA matrix.
to_plot_f, to_plot_s, clusters = test_params(data_matrix_p, numerical_labels, config, thresholds, branching_factors)

In [None]:
# Show the three result surfaces for the 512-feature PCA sweep.
plot_surfaces(to_plot_f, to_plot_s, clusters, thresholds, branching_factors)

In [None]:
# Load the 512-feature matrix obtained through Random Forest feature selection.
data_matrix_r = np.loadtxt('data/matrix/rfc_512_1209.txt')